In [None]:
# Import libraries 引入库
import pandas as pd
import numpy as np
import copy
import shap
import pymrmr
import seaborn as sns 
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams["font.sans-serif"] = ["Times New Roman"] # change font as Times New Roman

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_curve, auc, roc_auc_score, RocCurveDisplay

from collections import defaultdict
from rdkit import Chem
from rdkit import DataStructs

#### Dataset Input 数据读取

In [None]:
# Dataset input: Excel workbook indexed by dye name, with an "Exhaustion" column
path = "../Data/131 Exhaustion to PLA.xlsx"
exh_table = pd.read_excel(path, index_col = 0)  # read the workbook once; reused for names and labels
nams = exh_table.index.values  # Dye names

# Feature input: one CSV per fingerprint family, first column dropped.
# Forward slashes are used (as in `path`): backslash sequences such as "\D" or "\m"
# are invalid string escapes, and "\" is a path separator only on Windows.
mac_fp = pd.read_csv("../Data/model_maccs.csv").iloc[:,1:]
pub_fp = pd.read_csv("../Data/model_pubchem.csv").iloc[:,1:]
sub_fp = pd.read_csv("../Data/model_substructure.csv").iloc[:,1:]
suc_fp = pd.read_csv("../Data/model_substructure count.csv").iloc[:,1:]
est_fp = pd.read_csv("../Data/model_estate.csv").iloc[:,1:]

# Use the dye names as the row index of every fingerprint table, then join them column-wise
sub_fp.index = est_fp.index = mac_fp.index = pub_fp.index = suc_fp.index = nams
esm_fp = pd.concat([mac_fp, pub_fp, sub_fp, suc_fp, est_fp], axis = 1)

# High-/low-exhaustion class labels
exhs = exh_table["Exhaustion"]  # Measured exhaustion per dye
# Floor division by 80 maps values below 80 to 0 and values in [80, 160) to 1
# (presumably exhaustion is a percentage, i.e. the threshold is 80 % - confirm)
cate = [int(score//80) for score in exhs]

#### Dataset Pretreatment 数据预处理

In [None]:
# Keep only features with non-zero variance (drop columns that are constant over all dyes)
var = VarianceThreshold(threshold = 0)
var.fit(esm_fp)  # only the fitted support mask is needed, not the transformed array
esm_fp2 = esm_fp.loc[:, var.get_support()]

# Drop every feature whose absolute Pearson correlation with an earlier column exceeds 0.90
corr_matrix = esm_fp2.corr(numeric_only = True).abs()
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype = bool), k = 1)  # strictly above the diagonal
upper = corr_matrix.where(upper_mask)
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
esm_fp3 = esm_fp2.drop(columns = to_drop)

#### Frequently Used Tool Definition 常用小组件

In [None]:
# 10-fold stratified cross-validation splitter (shuffled, fixed seed)
SKFold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
# 5-fold stratified cross-validation splitter (shuffled, fixed seed)
SKFold_5 =  StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
# Validation metrics: scikit-learn scoring name -> short display label
criterion = {"accuracy":"ACC", "f1_weighted":"F1", "roc_auc":"AUC"}

# Train/test split of the filtered features and class labels:
# 80 % training, stratified on the class labels, fixed seed
fp_tra, fp_tes, cate_tra, cate_tes = train_test_split(esm_fp3, cate, train_size = 0.8, random_state = 41, stratify = cate)

#### Function Definition 定义函数

In [None]:
# mRMR feature-selection function
def mrmr_selection(feature, target, num: int):
    """Select `num` features with pymrmr's mRMR using the 'MIQ' scheme.

    A copy of `feature` gets `target` inserted as its first column, named
    "Category", and that table is handed to `pymrmr.mRMR`; `feature` itself
    is not modified.

    Returns the result of `pymrmr.mRMR` unchanged (callers use it to select
    columns of the feature table).
    """
    labelled = feature.copy()
    labelled.insert(loc = 0, column = "Category", value = target)
    return pymrmr.mRMR(labelled, 'MIQ', num)

# Stratified cross-validation performance function
def skfold_perf(n, model, X_data, y_data):
    """Score `model` with n-fold stratified cross-validation.

    The splitter is shuffled with a fixed seed (42). Labels are flattened
    with `np.array(y_data).ravel()` before scoring.

    Returns a list with the mean score over the folds, rounded to three
    decimals, for "accuracy", "f1_weighted" and "roc_auc" (in that order).
    """
    splitter = StratifiedKFold(n_splits = n, shuffle = True, random_state = 42)
    labels = np.array(y_data).ravel()
    # scikit-learn scoring name -> display label; only the keys (and their order) are used here
    metrics = {"accuracy":"ACC", "f1_weighted":"F1", "roc_auc":"AUC"}
    return [
        cross_val_score(model, X_data, labels, cv = splitter, scoring = metric).mean().round(3)
        for metric in metrics
    ]

# Confusion matrix drawing function
def cm_drawing(n:int, model, X_data, y_data:list):
    """Plot the confusion matrix pooled over n-fold stratified cross-validation.

    Parameters
    ----------
    n : int
        Number of folds of the stratified cross-validation.
    model
        Classifier; it is refit on the training part of every fold, so on
        return it is left fitted on the last fold's training data.
    X_data
        Feature table; must support `.astype` and `.iloc` (e.g. a DataFrame).
    y_data
        Class labels; flattened with `np.array(y_data).ravel()`.

    The held-out predictions of all folds are concatenated and compared with
    the true labels in a single confusion matrix, drawn as a seaborn heatmap.
    Note that `sns.set_theme` changes the global plotting theme as a side effect.
    """
    X = X_data.astype("float")
    Y = np.array(y_data).ravel()

    pooled_true, pooled_pred = [], []  # held-out labels / predictions collected over all folds

    splitter = StratifiedKFold(n_splits = n, shuffle = True, random_state = 42)
    for train_index, test_index in splitter.split(X, Y):
        model.fit(X.iloc[train_index, :], Y[train_index])
        pooled_pred.extend(model.predict(X.iloc[test_index, :]))
        pooled_true.extend(Y[test_index])

    cm = confusion_matrix(pooled_true, pooled_pred)
    sns.set_theme(style = "whitegrid",font = 'Times New Roman', font_scale = 2)
    plt.figure()
    # Counts are integers: "d" prints them exactly, whereas ".2g" would show
    # any count of 100 or more in scientific notation (e.g. "1e+02").
    # The colour scale is fixed to 0..60, so larger counts share the top colour.
    sns.heatmap(cm, annot = True, fmt = "d", cmap = "viridis", linewidths = 3, vmin = 0, vmax = 60,
                linecolor = "white", square = True)
    plt.xlabel("Predicted Class", fontsize = 20)
    plt.ylabel("Experimental Class", fontsize = 20)
    plt.show()

# ROC curve drawing function
def cv_roc_drawing(n:int, model, X_data, y_data):
    """Draw per-fold and mean ROC curves from n-fold stratified cross-validation.

    Parameters
    ----------
    n : int
        Number of folds of the stratified cross-validation.
    model
        Classifier; it is refit on the training part of every fold, so on
        return it is left fitted on the last fold's training data.
    X_data
        Feature table; must support `.astype` and `.iloc` (e.g. a DataFrame).
    y_data
        Class labels; flattened with `np.array(y_data).ravel()`.

    Each fold's ROC curve is interpolated onto a common grid of 100 false
    positive rates; the mean curve, its AUC (with the standard deviation of the
    per-fold AUCs) and a +/- 1 standard deviation band are drawn on top.
    Returns None; the figure is shown with `plt.show()`.
    """
    # Work on the arguments themselves. Reading notebook-level `X`/`Y` here
    # would fail on a fresh kernel or silently plot data other than what was passed in.
    X = X_data.astype("float")
    Y = np.array(y_data).ravel()

    tprs = []  # Interpolated true positive rates, one array per fold
    aucs = []  # AUC value of each fold
    mean_fpr = np.linspace(0, 1, 100)

    SKF = StratifiedKFold(n_splits = n, shuffle = True, random_state = 42)
    fig, ax = plt.subplots(figsize = (15, 15))
    for i, (train_index, test_index) in enumerate(SKF.split(X, Y)):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        Y_train, Y_test = Y[train_index], Y[test_index]

        model.fit(X_train, Y_train)
        viz = RocCurveDisplay.from_estimator(model, X_test, Y_test, name = "ROC fold{}".format(i), alpha = 0.5,
                                             lw = 4, ax = ax)
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0  # force the curve to start at (0, 0)
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)

    mean_tpr = np.mean(tprs, axis = 0)
    mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)

    ax.plot(mean_fpr, mean_tpr, color = "blue", label = r"Mean ROC(AUC = %0.2f ± %0.2f)" % (mean_auc, std_auc),
            lw = 8, alpha = 0.9)

    # Shaded band: mean TPR +/- one standard deviation, clipped to [0, 1]
    std_tpr = np.std(tprs, axis = 0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color = "grey", alpha = .2, label = "± 1 std. dev.")

    ax.set(xlim = [-0.05, 1.05], ylim = [-0.05, 1.05])
    ax.axis("square")
    # grid() without arguments toggles grid visibility; the intent stated by the
    # author is to remove the grid lines, which presumes the active theme draws
    # a grid by default (e.g. seaborn "whitegrid") - confirm
    ax.grid()
    ax.legend(loc = "lower right")
    ticks = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
    ax.set_xlabel("False Positive Rate", labelpad = 15, fontsize = 50, fontweight = "bold")  # labelpad separates the axis title from the ticks
    ax.set_ylabel("True Positive Rate", labelpad = 15, fontsize = 50, fontweight = "bold")
    ax.set_xticks(ticks, labels = ticks, fontsize = 50, fontweight = "bold")  # Tick positions and labels
    ax.set_yticks(ticks, labels = ticks, fontsize = 50, fontweight = "bold")
    plt.setp(ax.spines.values(), linewidth = 4, color = "black")  # Frame thickness
    plt.plot([0, 1], [0, 1], linestyle = "--", color = "black", alpha = 0.8, lw = 4)  # Diagonal reference line
    plt.tick_params(axis = "both", length = 10, width = 4)  # Tick thickness and length
    # plt.savefig(f"{n}-fold ROC.png", dpi = 300)  # Save the picture before show()

    plt.show()

#### 10-fold Stratified Cross-validation 10折分层交叉验证

In [None]:
# Baseline: all pre-filtered training features with a gradient boosting
# classifier that only fixes the random seed
X = fp_tra.astype("float") 
Y = np.array(cate_tra).ravel()
GB = GradientBoostingClassifier(random_state = 42)

# Print the [ACC, F1, AUC] list returned by skfold_perf for 10 folds
# (the message text reads "10-fold stratified CV performance on the original training set")
print("原始训练集10折分层交叉验证性能:{}".format(skfold_perf(10, GB, X, Y)))

# Confusion matrix pooled over the 10 folds
cm_drawing(10, GB, X, Y)

#### Feature Engineering 特征筛选

In [None]:
# Sweep candidate feature counts: for each count, select features with mRMR on
# the training set and score GB by 10-fold stratified cross-validation.
# NOTE(review): the original note also mentions a finer 5..15 (step 1) sweep;
# only the coarse 10..40 (step 10) sweep is coded here.
for fn in range(10, 41, 10):
    prin_fp = mrmr_selection(fp_tra, cate_tra, fn)
    fe_tra = fp_tra.loc[:,prin_fp]
    sel_pef = []  # Mean [ACC, F1, AUC] for this feature count, in `criterion` order
    for scoring in criterion:
        answer = cross_val_score(GB, fe_tra, np.array(cate_tra).ravel(), cv = SKFold, scoring = scoring)
        sel_pef.append(answer.mean().round(3))
    # Report the scores; without this the sweep computes results and discards them
    print("fea_num:{}, Acc:{}, F1 score:{}, AUC:{}".format(fn, sel_pef[0], sel_pef[1], sel_pef[2]))

In [None]:
# The author records 11 as the best feature count: re-run mRMR for 11 features
# and keep the corresponding training columns
prin_fp = mrmr_selection(fp_tra, cate_tra, 11)
fe_tra = fp_tra.loc[:,prin_fp] 

#### Model Optimization 模型优化

#### Final Model 最终模型

In [None]:
# Final model inputs: the mRMR-selected training features (as float) and the
# flattened training labels
X = fe_tra.astype("float") 
Y = np.array(cate_tra).ravel()
# Gradient boosting classifier with explicitly set hyper-parameters
# (presumably the outcome of the model-optimization step - confirm)
GB = GradientBoostingClassifier(random_state = 42, learning_rate = 0.1, min_samples_split =13, 
                                min_samples_leaf = 1,max_depth = 5, max_features = 3, subsample = 0.8, 
                                n_estimators = 40)

#### 10- and 5-fold Stratified Cross-validation 10折、5折分层交叉

In [None]:
# 10-fold stratified cross-validation of the final model
ans_10 = skfold_perf(10, GB, X, Y)  # [ACC, F1, AUC] means over the folds
print("Acc:{}, F1 score:{}, AUC:{}".format(ans_10[0], ans_10[1], ans_10[2]))
print("10-fold confusion matrix:")
cm_drawing(10, GB, X, Y)  # Confusion matrix pooled over the folds

In [None]:
# 5-fold stratified cross-validation of the final model
ans_05 = skfold_perf(5, GB, X, Y)  # [ACC, F1, AUC] means over the folds
print("Acc:{}, F1:{}, AUC:{}".format(ans_05[0], ans_05[1], ans_05[2]))
print("05-fold confusion matrix:")
cm_drawing(5, GB, X, Y) 

#### 10- and 5-fold Stratified Cross-validation ROC 10折、5折分层交叉ROC曲线

In [None]:
# Cross-validated ROC curves of the final model: first 10 folds, then 5 folds
cv_roc_drawing(10, GB, X, Y)
cv_roc_drawing(5, GB, X, Y)

#### Test-set Validation 测试集验证

In [None]:
# Fresh final model, trained on the full training set with the selected features
GB = GradientBoostingClassifier(
    n_estimators = 40,
    learning_rate = 0.1,
    max_depth = 5,
    max_features = 3,
    min_samples_split = 13,
    min_samples_leaf = 1,
    subsample = 0.8,
    random_state = 42,
)
GB.fit(fe_tra, cate_tra)

# Test-set features restricted to the columns the model was trained on
tes_selected = fp_tes[fe_tra.columns]
cate_pre = GB.predict(tes_selected)
cate_score = GB.predict_proba(tes_selected)[:, 1]  # Predicted probability of the second class

# Test-set metrics
tes_acc = accuracy_score(cate_tes, cate_pre)
tes_f1 = f1_score(cate_tes, cate_pre, average = "weighted")
tes_auc = roc_auc_score(cate_tes, cate_score, average = "macro")
print("Acc:{:.2f}, F1 score:{:.2f}, AUC:{:.2f}".format(tes_acc, tes_f1, tes_auc))

# Test-set confusion matrix (rows: true class, columns: predicted class)
cm_t = confusion_matrix(cate_tes, cate_pre)
print(cm_t)

#### Test-set Validation ROC 测试集ROC曲线

In [None]:
# ROC curve of the fitted model GB on the test set, drawn on a 15 x 15 inch figure
plt.figure(figsize = (15, 15))
ax = plt.gca()
# ax.plot([0, 1], [0, 1], linestyle = "--", color = "black", label = "Chance level")
GB_ROC = RocCurveDisplay.from_estimator(GB, fp_tes[fe_tra.columns], cate_tes, color = "blue",ax = ax,
                                        lw = 8, plot_chance_level = False)
# grid() without arguments toggles grid visibility
# NOTE(review): whether this hides or shows the grid depends on the active theme - confirm
ax.grid()
plt.legend(loc = "lower right")  # Show the legend in the lower-right corner
ticks = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
ax.set_xlabel("False Positive Rate", labelpad = 15, fontsize = 50, fontweight = "bold") # labelpad separates the axis title from the ticks
ax.set_ylabel("True Positive Rate", labelpad = 15, fontsize = 50, fontweight = "bold")
ax.set_xticks(ticks, labels = ticks, fontsize = 50, fontweight = "bold") # Tick positions and labels
ax.set_yticks(ticks, labels = ticks, fontsize = 50, fontweight = "bold")
plt.setp(ax.spines.values(), linewidth = 4, color = "black") # Frame thickness
plt.plot([0, 1], [0, 1], linestyle = "--", color = "black", alpha = 0.8, lw = 4) # Diagonal reference line (drawn after the legend, so it has no legend entry)
plt.tick_params(axis = "both", length = 10, width = 4)  # Tick thickness and length
# plt.savefig("Test ROC.png", dpi = 300)
plt.show()

#### External-set Validation 外部集验证

In [None]:
# Fresh final model, trained on the full training set with the selected features
GB = GradientBoostingClassifier(random_state = 42, learning_rate = 0.1, min_samples_split =13, 
                                min_samples_leaf = 1,max_depth = 5, max_features = 3, subsample = 0.8, 
                                n_estimators = 40)
GB.fit(fe_tra, cate_tra)
# External-set input: one CSV per fingerprint family, first column dropped
emac_fp = pd.read_csv("../Fingerprints/e_maccs.csv").iloc[:,1:] 
epub_fp = pd.read_csv("../Fingerprints/e_pubchem.csv").iloc[:,1:] 
esub_fp = pd.read_csv("../Fingerprints/e_substructure.csv").iloc[:,1:]
esuc_fp = pd.read_csv("../Fingerprints/e_substructure count.csv").iloc[:,1:]

# Feature integration: join the external fingerprint tables column-wise
# NOTE(review): unlike the training table, no EState fingerprint file is loaded here;
# the column selection below would raise KeyError if a selected feature is an
# EState column - confirm none of the selected features comes from that family
eesm_fp = pd.concat([emac_fp, epub_fp, esub_fp, esuc_fp], axis = 1) 

# External-set prediction, restricted to the columns the model was trained on
cate_pre = GB.predict(eesm_fp[fe_tra.columns])
cate_score = GB.predict_proba(eesm_fp[fe_tra.columns])[:, 1]  # Predicted probability of the second class

# Hard-coded true labels of the 15 external samples
# (presumably in the same row order as the external CSV files - confirm)
cate_ext = [0,0,0,0,1,0,0,0,0,1,0,0,0,0,0]

print("True Label", cate_ext)
print("Pred Label", list(cate_pre))
print("Label Possibility", list(map(lambda i: round(i, 2), cate_score)))
print("ACC:{:.3f}, F1 score:{:.3f}, AUC:{:.3f}".format(accuracy_score(cate_ext, cate_pre), f1_score(cate_ext, cate_pre, average = "weighted"),
                                                       roc_auc_score(cate_ext, cate_score, average = "macro")))

cm_e = confusion_matrix(cate_ext, cate_pre)  # External-set confusion matrix
print(cm_e)

#### External-set Validation ROC 外部集ROC曲线

In [None]:
# ROC curve of the fitted model GB on the external set, drawn on a 15 x 15 inch figure
plt.figure(figsize = (15, 15))
ax = plt.gca()
# ax.plot([0, 1], [0, 1], linestyle = "--", color = "black", label = "Chance level")
GB_ROC = RocCurveDisplay.from_estimator(GB, eesm_fp[fe_tra.columns], cate_ext, color = "blue",ax = ax,
                                        lw = 8, plot_chance_level = False)
# grid() without arguments toggles grid visibility
# NOTE(review): whether this hides or shows the grid depends on the active theme - confirm
ax.grid()
plt.legend(loc = "lower right")  # Show the legend in the lower-right corner
ticks = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
ax.set_xlabel("False Positive Rate", labelpad = 15, fontsize = 50, fontweight = "bold") # labelpad separates the axis title from the ticks
ax.set_ylabel("True Positive Rate", labelpad = 15, fontsize = 50, fontweight = "bold")
ax.set_xticks(ticks, labels = ticks, fontsize = 50, fontweight = "bold") # Tick positions and labels
ax.set_yticks(ticks, labels = ticks, fontsize = 50, fontweight = "bold")
plt.setp(ax.spines.values(), linewidth = 4, color = "black") # Frame thickness
plt.plot([0, 1], [0, 1], linestyle = "--", color = "black", alpha = 0.8, lw = 4) # Diagonal reference line (drawn after the legend, so it has no legend entry)
plt.tick_params(axis = "both", length = 10, width = 4)  # Tick thickness and length
# plt.savefig("External ROC.png", dpi = 300)
plt.show()

#### SHAP Interpretation SHAP解释

In [None]:
shap.initjs() # Load SHAP's JavaScript support for notebook rendering
# Explainer built on GB.predict (the predicted class labels, not probabilities),
# with the selected training features passed as the second argument
explainer = shap.Explainer(GB.predict, fe_tra)
shap_values = explainer(fe_tra)  # SHAP values for every training sample

# 1. Feature importance ranking (bar summary plot)
plt.grid(False)
shap.summary_plot(shap_values, fe_tra, plot_type = "bar", show = False)  # show = False keeps the figure open so it can be saved
# plt.savefig("summary_plot.png", bbox_inches="tight", dpi = 500) # Output resolution
# Author's notes on some features:
# MACCS108：CH3AAACH2A # SubFP32：Secondary mixed amine
# SubFPC295：C ONS bond # Pubchem466：N#C-C=C 

In [None]:
# 2. Beeswarm plot of the SHAP values, showing at most 11 features
shap.plots.beeswarm(shap_values, max_display = 11, show = False)
# grid() without arguments toggles grid visibility
# NOTE(review): whether this hides or shows the grid depends on the active theme - confirm
plt.grid()
# plt.savefig("beeswarm.png", bbox_inches="tight", dpi = 500)