# 基于Scikit-Learn API的 Classification模板

>20221110sym新建，从原来Jacs Cu体系的程序中借鉴，综合多种分类模型看特征重要性和全集表现

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import Pool
import time
# Take one timestamp so the compact (file-name) form and the human-readable
# form always describe the same instant, even across a second boundary.
_run_started = time.localtime()
c_time = time.strftime("%Y%m%d_%H%M%S", _run_started)
c_time_m = time.strftime("%Y-%m-%d %H:%M:%S", _run_started)

In [2]:
# Parameters
# ======== System Setup ========
Version = 'Classification-Multimodel_-sym'
EPOCH = 16
CORE_NUM = 16       # total number of rounds = EPOCH / CORE_NUM
# ======== Fit Data Input ========
S_N = 237
F_N = 54
INPUT_X = f'Features_{S_N}_{F_N}.csv'
INPUT_Y = f'Values_True_{S_N}.csv'
INPUT_TITLE = f'Title_{F_N}.csv'
INPUT_Xtest = 'predFull_8_56.csv'

BEGIN_INDEX = 0
# None (or BEGIN_INDEX = 0 together with the number of features) selects every feature column.
END_INDEX_PLUS_ONE = None
INPUT_SMILES = f'Smiles_{S_N}.csv'
TITLE_DATE = '220322'
INPUT_MFF = f'Title_MFF-{TITLE_DATE}.csv'
INPUT_DESC = f'Title_Desc-{TITLE_DATE}.csv'
INPUT_CONJU = f'Title_Conju-{TITLE_DATE}.csv'
# ======== Find Split Settings ========
INPUT_SPLIT = 'R2_0.7515_XGB-Split.csv'
FIND_SPLIT = True
# CAL_MAE_LOOP = False
SAVE_MODEL = True
# ======== Other Fitting Settings ========
TRAIN_TEST_SPLIT = 0.85
SORT_SAMPLE = False
TEST_SPLIT_OOB = False
R2_HIGHER_LIMIT = 0.70
CONFIDENCE = 0.95
SAVE_RESULTS_OF_EVERY_ROUND = False
# ======== Data Output ========
SAVE_NAME = f'XGBoostClassification_{c_time}.png'
SUPTITLE = f'XGBoost on {INPUT_X} and {INPUT_Y}\nEPOCH:{EPOCH}\n'

In [3]:
# Load the feature matrix, the feature names and the target values, keeping
# only the feature columns selected by BEGIN_INDEX:END_INDEX_PLUS_ONE.
X = np.loadtxt(INPUT_X, delimiter=',')[:, BEGIN_INDEX:END_INDEX_PLUS_ONE]
# comments='!' replaces the default '#' marker so a '#' inside a feature name is read as text.
title = np.loadtxt(INPUT_TITLE, dtype='str', delimiter=',', comments='!')[BEGIN_INDEX:END_INDEX_PLUS_ONE, ]
y = np.loadtxt(INPUT_Y)
print('X:', X.shape, '   y:', y.shape)
headers=title
# Binarise the target: values of at least 5 form the positive class.
y=y>=5
if INPUT_Xtest is not None:
    # ndmin=2 keeps a single-row prediction file two-dimensional so the column slice still works.
    predtest = np.loadtxt(INPUT_Xtest, delimiter=',', ndmin=2)[:, BEGIN_INDEX:END_INDEX_PLUS_ONE]
    if predtest.shape[1] != X.shape[1]:
        # The fitted models can only predict on inputs with the training feature count.
        print('WARNING: prediction set has', predtest.shape[1],
              'feature columns but the training set has', X.shape[1])
    # Predictions are obtained with e.g. gbct.predict(predtest).

X: (237, 54)    y: (237,)


In [4]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import auc
import pandas as pd
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
import joblib
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics

In [5]:
# Hold out a test set and define the resampling scheme shared by all models.
# The hold-out fraction comes from TRAIN_TEST_SPLIT instead of a repeated literal;
# rounding removes the floating-point noise of 1 - 0.85.
TEST_FRACTION = round(1 - TRAIN_TEST_SPLIT, 6)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=3)
# A fixed seed makes every cv.split(...) call return the same folds, so grid searches,
# ROC curves and cross_validate scores use identical, reproducible splits.
cv = StratifiedShuffleSplit(n_splits=10, test_size=TEST_FRACTION, random_state=3)

In [6]:
import os
from pathlib import Path
# Output directory of this run: version tag, number of features and start timestamp.
DIR = 'Classification-Multimodel_'+Version+str(X.shape[1])+'_Fs_'+c_time
# exist_ok lets this cell be re-run in the same session, where c_time (and so DIR) is unchanged.
os.makedirs(DIR, exist_ok=True)

In [7]:
# Open the run log inside the output directory and write its header.
LOG_NAME = 'Classification-Multimodel_votingclassifier_Log_'+c_time+'.txt'
LOG_NAME = Path('.', DIR, LOG_NAME)
# The handle stays open because the model cells below keep writing to it.
# Line buffering (buffering=1) flushes each finished line to disk, so the log
# survives an interrupted kernel. NOTE(review): call f1.close() after the last
# cell that writes to the log.
f1 = open(LOG_NAME, 'w+', buffering=1, encoding='utf-8')
# The header names what this notebook does; it previously read "XGBoost Regressor".
f1.write('Multi-model Classification\n\n')
f1.write('Total Epoch: '+str(EPOCH)+'\n\n')
f1.write('Dataset: '+INPUT_X+', '+INPUT_Y+'\n')
f1.write('Data Shape: '+str(X.shape)+', '+str(y.shape)+'\n\n')


31

In [None]:
# Build and tune a decision tree classifier.
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
tuned_parameters = [{'min_impurity_decrease': [0.01, 0.02, 0.03, 0.04], 'max_depth': [3, 4, 5, 6, 7, 8],
                     'max_features': ['sqrt', 'log2'], 'max_leaf_nodes': [3,4,5,6,7,8,9], 'class_weight': ['balanced'],
                      'min_samples_split': [3,4,5]}]
dtc = DecisionTreeClassifier()
# Exhaustive grid search scored by ROC AUC on the shared shuffle-split scheme.
clf = GridSearchCV(dtc, tuned_parameters, verbose=1, scoring='roc_auc', cv=cv, n_jobs=CORE_NUM)
clf.fit(X_train, y_train)
clf_new = clf.best_estimator_
best_p = clf.best_params_
paras = clf_new.get_params()
print(best_p)
# Accuracy of the best tree on the held-out test split.
acc_unique = clf_new.score(X_test, y_test)
print('Current accuracy:', acc_unique)
# From here on `clf` is the tuned estimator itself, not the search object.
clf=clf_new
f1.write('Decision Tree optimized paras:\n'+str(best_p)+'\n')
if INPUT_Xtest is not None:
    predres1=clf.predict(predtest)
    print(predres1)
    f1.write('Pred_full result is:'+str(predres1)+'\n')

Fitting 10 folds for each of 1008 candidates, totalling 10080 fits


In [None]:
# Rank the features of the tuned decision tree and plot the most important ones.
top_n = min(3, len(title))
p = np.argsort(-clf.feature_importances_)   # feature indices, most important first
feat = np.array(title[p[:top_n]])
importance = np.array(clf.feature_importances_[p[:top_n]])
for name, value in zip(feat, importance):
    print('feature_name:', name, 'importance: ', value)

# Plot in ascending order so the most important feature ends up at the top of the chart.
tree_feature_importances = importance
sorted_idx = tree_feature_importances.argsort()

# Output figure size
figsize = 12,6
fig, ax = plt.subplots(figsize=figsize)
# Names and values are indexed with the same order so every bar carries its own label.
ax.barh(feat[sorted_idx], tree_feature_importances[sorted_idx])
plt.title("Decision Tree model feature importance",fontname="Times New Roman", fontsize=25)
plt.xlabel('Feature importance', fontdict={"family": "Times New Roman", "size": 25})
plt.ylabel('Feature names', fontdict={"family": "Times New Roman", "size": 25})
plt.xticks(fontname="Times New Roman", fontsize=25)
plt.yticks(fontname="Times New Roman", fontsize=25)
fig.tight_layout()
plot_tree_name_2 = 'Decision Tree Model Feature importance_'+'_acc_'+str(round(acc_unique,2))+'.png'
plot_tree_name_2 = Path('.', DIR, plot_tree_name_2)
plt.savefig(plot_tree_name_2, bbox_inches='tight',dpi=200)
plt.show()

In [None]:
# Save the plotted decision-tree features and their importances as two CSV lines.
str_array=",".join(map(str, importance[sorted_idx[:5]]))
# `sorted_idx` indexes the top-feature arrays (`feat` / `importance`), not the full
# header list, so the names must come from `feat`.
str_array1=",".join(map(str, feat[sorted_idx]))
TXT_NAME= 'Decision Tree Model_'+'Feature importance'+'.txt'
with open(TXT_NAME, 'w',encoding='utf-8') as w:
    # First line: feature names; second line: the matching importances.
    w.write(str_array1+'\n')
    w.write(str_array+'\n')

In [None]:
# Confusion matrix of the tuned decision tree on the held-out test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
modelname='Decision Tree'
plt.title(modelname)
PLOT_NAME2 = 'confusion_matrix_'+modelname+'.png'
PLOT_NAME2 = Path('.', DIR, PLOT_NAME2)
plt.savefig(PLOT_NAME2, bbox_inches='tight',dpi=150)
plt.show()

In [None]:
# Cross-validated ROC curves for the tuned decision tree.
# A fresh clone is fitted on every split, so the tuned estimator keeps the fit
# it received from the grid search instead of being overwritten by the last fold.
from sklearn.base import clone

classifier = clf

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(6, 6), dpi=250)
for i, (train, test) in enumerate(cv.split(X, y)):
    fold_model = clone(classifier)
    fold_model.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        fold_model,
        X[test],
        y[test],
        name="ROC fold {}".format(i),
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    # Resample each fold's curve onto the common FPR grid so the folds can be averaged.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)

# Mean curve over the folds, forced to end at (1, 1).
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

# Shade one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
plt.title("Decision Tree ",fontname="Times New Roman", fontsize=25)
plt.xlabel('False positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.ylabel('True positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.xticks(fontname="Times New Roman", fontsize=25)
plt.yticks(fontname="Times New Roman", fontsize=25)
ax.legend(loc="lower right")

# Accuracy and F1 over the same resampling scheme; both go into the file name.
score1=cross_validate(classifier, X, y, cv=cv, scoring=('accuracy','f1'))

print(score1)
PLOT_NAME6 = ('Receiver operating characteristic_'+'Decision Tree Model_'
              +'_AUC_'+str(round(np.mean(mean_auc),3))
              +'_acc_'+str(round(np.mean(score1['test_accuracy']),3))
              +'_f1_'+str(round(np.mean(score1['test_f1']),3))+'.png')
PLOT_NAME6 = Path('.', DIR, PLOT_NAME6)
plt.savefig(PLOT_NAME6, dpi=250, bbox_inches='tight')
plt.show()

In [None]:
# Inspection: number of training samples in the last split produced by the ROC loop.
len(X[train])

In [None]:
# Build and tune a random forest classifier.
import sklearn.ensemble
# 'sqrt' is what max_features='auto' meant for classifiers; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it makes the grid search fail.
tuned_parameters = [{'min_impurity_decrease': [0.0], 'max_depth': [4,5,6,7,8,9],
                     'max_features': ['sqrt'], 'max_leaf_nodes': [4, 5,6,7,8],'criterion':["gini" ,"entropy"]}]
rf = sklearn.ensemble.RandomForestClassifier()
# Exhaustive grid search scored by ROC AUC on the shared shuffle-split scheme.
rf = GridSearchCV(rf, tuned_parameters, verbose=1,  scoring='roc_auc', cv=cv, n_jobs=CORE_NUM)
rf.fit(X_train, y_train)
rf_new = rf.best_estimator_
best_p = rf.best_params_
paras = rf_new.get_params()
print(best_p)
# Accuracy of the best forest on the held-out test split.
acc_unique = rf_new.score(X_test, y_test)
print('Current accuracy:', acc_unique)
# From here on `rf` is the tuned estimator itself, not the search object.
rf=rf_new
f1.write('Random forest optimized paras:\n'+str(best_p)+'\n')
if INPUT_Xtest is not None:
    predres1=rf.predict(predtest)
    print(predres1)
    f1.write('Pred_full result is:'+str(predres1)+'\n')

In [None]:
from sklearn.inspection import permutation_importance
# Horizontal bar chart of the (up to) ten most important random-forest features.
n_top = min(10, len(headers))
first_shown = len(headers) - n_top   # bars below this position are not drawn
tree_importance_sorted_idx = np.argsort(rf.feature_importances_)
tree_indices = np.arange(0, len(rf.feature_importances_)) + 0.5

fig, (ax1) = plt.subplots(figsize=(12, 6))
ax1.barh(tree_indices[first_shown:],
         rf.feature_importances_[tree_importance_sorted_idx[first_shown:]], height=0.7)
# Fix the tick positions first and label them afterwards; labelling before the
# positions are fixed makes Matplotlib warn about a formatter/locator mismatch.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(np.array(headers)[tree_importance_sorted_idx])
# Restrict the view to the bars that were drawn.
ax1.set_ylim((first_shown,len(headers)))
plt.title("Random Forest model feature importance",fontname="Times New Roman", fontsize=25)
plt.xlabel('Feature importance', fontdict={"family": "Times New Roman", "size": 25})
plt.ylabel('Feature names', fontdict={"family": "Times New Roman", "size": 25})
plt.xticks(fontname="Times New Roman", fontsize=25)
plt.yticks(fontname="Times New Roman", fontsize=25)
fig.tight_layout()
plot_tree_name_2 = 'Random Forest Model Feature importance_'+'_acc_'+str(round(acc_unique,2))+'.png'
plot_tree_name_2 = Path('.', DIR, plot_tree_name_2)
plt.savefig(plot_tree_name_2, bbox_inches='tight',dpi=200)
plt.show()

In [None]:
# Confusion matrix of the tuned random forest on the held-out test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(rf, X_test, y_test)
modelname='Random Forest'
plt.title(modelname)
PLOT_NAME2 = 'confusion_matrix_'+modelname+'.png'
PLOT_NAME2 = Path('.', DIR, PLOT_NAME2)
plt.savefig(PLOT_NAME2, bbox_inches='tight',dpi=150)
plt.show()

In [None]:
# Cross-validated ROC curves for the tuned random forest.
# A fresh clone is fitted on every split, so the tuned estimator keeps the fit
# it received from the grid search instead of being overwritten by the last fold.
from sklearn.base import clone

classifier = rf

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(6, 6), dpi=250)
for i, (train, test) in enumerate(cv.split(X, y)):
    fold_model = clone(classifier)
    fold_model.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        fold_model,
        X[test],
        y[test],
        name="ROC fold {}".format(i),
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    # Resample each fold's curve onto the common FPR grid so the folds can be averaged.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)

# Mean curve over the folds, forced to end at (1, 1).
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

# Shade one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
plt.title("Random Forest",fontname="Times New Roman", fontsize=25)
plt.xlabel('False positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.ylabel('True positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.xticks(fontname="Times New Roman", fontsize=25)
plt.yticks(fontname="Times New Roman", fontsize=25)
ax.legend(loc="lower right")

# Accuracy and F1 over the same resampling scheme; both go into the file name.
score1=cross_validate(classifier, X, y, cv=cv, scoring=('accuracy','f1'))

print(score1)
PLOT_NAME6 = ('Receiver operating characteristic_'+'Random Forset Model_'
              +'_AUC_'+str(round(np.mean(mean_auc),3))
              +'_acc_'+str(round(np.mean(score1['test_accuracy']),3))
              +'_f1_'+str(round(np.mean(score1['test_f1']),3))+'.png')
PLOT_NAME6 = Path('.', DIR, PLOT_NAME6)
plt.savefig(PLOT_NAME6, dpi=250, bbox_inches='tight')
plt.show()

In [None]:
# Build and tune a gradient-boosting classifier (GBCT).
from sklearn.ensemble import GradientBoostingClassifier
tuned_parameters = [{'learning_rate': [0.0025,0.01,0.05,0.1], 'subsample': [0.5], 'min_impurity_decrease': [0.3,0.5],
                     'max_depth': [3, 5,10], 'warm_start': [False], 'max_features': ['sqrt'], 'max_leaf_nodes': [4, 6,10],'n_estimators':[10,50,500]}]
# Early stopping: 15 % of each training set is held out and boosting stops after
# 50 iterations without an improvement larger than tol.
gbct= GradientBoostingClassifier(verbose=0, validation_fraction=0.15, n_iter_no_change=50, tol=0.0001)
# Exhaustive grid search scored by ROC AUC on the shared shuffle-split scheme.
gbct = GridSearchCV(gbct, tuned_parameters, verbose=1,  scoring='roc_auc', cv=cv, n_jobs=CORE_NUM)
gbct.fit(X_train, y_train)
gbct_new = gbct.best_estimator_
best_p = gbct.best_params_
paras = gbct_new.get_params()
print(best_p)
# Accuracy of the best model on the held-out test split.
acc_unique = gbct_new.score(X_test, y_test)
print('Current accuracy:', acc_unique)
# From here on `gbct` is the tuned estimator itself, not the search object.
gbct=gbct_new
print(gbct.feature_importances_)
f1.write('GBCT optimized paras:\n'+str(best_p)+'\n')
if INPUT_Xtest is not None:
    predres1=gbct.predict(predtest)
    print(predres1)
    f1.write('Pred_full result is:'+str(predres1)+'\n')

In [None]:
# gbct.predict(test)

In [None]:
# Horizontal bar chart of the (up to) six most important gradient-boosting features.
n_top = min(6, len(headers))
first_shown = len(headers) - n_top   # bars below this position are not drawn
tree_importance_sorted_idx = np.argsort(gbct.feature_importances_)
tree_indices = np.arange(0, len(gbct.feature_importances_)) + 0.5

fig, (ax1) = plt.subplots(figsize=(12, 6))
ax1.barh(tree_indices[first_shown:],
         gbct.feature_importances_[tree_importance_sorted_idx[first_shown:]], height=0.7)
# Fix the tick positions first and label them afterwards; labelling before the
# positions are fixed makes Matplotlib warn about a formatter/locator mismatch.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(np.array(headers)[tree_importance_sorted_idx])
# Restrict the view to the bars that were drawn.
ax1.set_ylim((first_shown,len(headers)))
plt.title("GradientBoostingClassifier model feature importance",fontname="Times New Roman", fontsize=25)
plt.xlabel('Feature importance', fontdict={"family": "Times New Roman", "size": 25})
plt.ylabel('Feature names', fontdict={"family": "Times New Roman", "size": 25})
plt.xticks(fontname="Times New Roman", fontsize=25)
plt.yticks(fontname="Times New Roman", fontsize=25)
fig.tight_layout()
plot_tree_name_2 = 'GradientBoostingClassifier Model Feature importance_'+'_acc_'+str(round(acc_unique,2))+'.png'
plot_tree_name_2 = Path('.', DIR, plot_tree_name_2)
plt.savefig(plot_tree_name_2, bbox_inches='tight',dpi=200)
plt.show()

In [None]:
# Inspection: number of feature names currently loaded.
len(headers)

In [None]:
# Confusion matrix of the tuned gradient-boosting classifier on the held-out test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(gbct, X_test, y_test)

modelname='GradientBoostingClassifier'
plt.title(modelname)
PLOT_NAME2 = 'confusion_matrix_'+modelname+'.png'
PLOT_NAME2 = Path('.', DIR, PLOT_NAME2)
plt.savefig(PLOT_NAME2, bbox_inches='tight',dpi=150)
plt.show()

In [None]:
import shap
# SHAP analysis of the tuned gradient-boosting model over the full feature matrix.
explainer = shap.TreeExplainer(gbct)
shap_values = explainer.shap_values(X)
shap.summary_plot(shap_values, X, feature_names=title)

# Store the SHAP matrix in the run directory.
save_name = Path('.', DIR, 'SHAP_Matrix_GBCT_'+c_time+'.csv')
np.savetxt(save_name, shap_values, fmt='%s', delimiter=',')

# One scatter plot per feature: feature value against its SHAP value.
for column, feature_name in enumerate(title):
    plt.scatter(X[:, column], shap_values[:, column])
    plt.title(feature_name)
    plt.show()

In [None]:
# Cross-validated ROC curves for the tuned gradient-boosting classifier.
# A fresh clone is fitted on every split, so the tuned estimator keeps the fit
# it received from the grid search instead of being overwritten by the last fold.
from sklearn.base import clone

classifier = gbct

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(6, 6), dpi=250)
for i, (train, test) in enumerate(cv.split(X, y)):
    fold_model = clone(classifier)
    fold_model.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        fold_model,
        X[test],
        y[test],
        name="ROC fold {}".format(i),
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    # Resample each fold's curve onto the common FPR grid so the folds can be averaged.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)

# Mean curve over the folds, forced to end at (1, 1).
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

# Shade one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
plt.title("GradientBoostingClassifier",fontname="Times New Roman", fontsize=25)
plt.xlabel('False positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.ylabel('True positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.xticks(fontname="Times New Roman", fontsize=25)
plt.yticks(fontname="Times New Roman", fontsize=25)
ax.legend(loc="lower right")

# Accuracy and F1 over the same resampling scheme; both go into the file name.
score1=cross_validate(classifier, X, y, cv=cv, scoring=('accuracy','f1'))

print(score1)
PLOT_NAME6 = ('Receiver operating characteristic_'+'GradientBoostingClassifier Model_'
              +'_AUC_'+str(round(np.mean(mean_auc),3))
              +'_acc_'+str(round(np.mean(score1['test_accuracy']),3))
              +'_f1_'+str(round(np.mean(score1['test_f1']),3))+'.png')
PLOT_NAME6 = Path('.', DIR, PLOT_NAME6)
plt.savefig(PLOT_NAME6, dpi=250, bbox_inches='tight')
plt.show()

In [None]:
# Logistic regression (no hyper-parameter search).
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(class_weight='balanced', verbose=0, n_jobs=-1)

lr.fit(X_train, y_train)
# One coefficient per feature, flattened from the (1, n_features) coefficient matrix.
coef = np.array(lr.coef_).flatten()

y_pred = lr.predict(X_test)
# The coefficients
print('Coefficients: \n', lr.coef_)


# Intercept of the fitted model
print('模型截距:')
print(lr.intercept_)
# Coefficients indexed by feature name, plotted as a signed horizontal bar chart.
coefs = pd.DataFrame(
   coef,
    columns=['Coefficients'], index=headers
)
coefs.plot(kind='barh', figsize=(9, 7))
plt.title('Logistic Regression classifier')
plt.axvline(x=0, color='.5')
plt.subplots_adjust(left=.3)
f1.write('Logistic Regression paras:\n'+str( lr.coef_)+'\n')
if INPUT_Xtest is not None:
    predres1=lr.predict(predtest)
    print(predres1)
    f1.write('Pred_full result is:'+str(predres1)+'\n')

In [None]:
import matplotlib
# Plot only the dominant logistic-regression coefficients: those whose magnitude
# exceeds the magnitude of the third-smallest (most negative first) coefficient.
bound=np.sort(lr.coef_)
threshold = np.abs(bound[0][2])
coefs_copy = coefs[coefs['Coefficients'].abs() > threshold]
coef_ax = coefs_copy.plot(kind='barh', figsize=(9, 8))
plt.axvline(x=0, color='.5')

plt.title("LogisticRegression model feature importance",fontname="Times New Roman", fontsize=25)
plt.xlabel('Feature importance', fontdict={"family": "Times New Roman", "size": 25})
plt.ylabel('Feature names', fontdict={"family": "Times New Roman", "size": 25})
plt.xticks(fontname="Times New Roman", fontsize=25)
plt.yticks(fontname="Times New Roman", fontsize=25)
# Lay out the figure that was just drawn, not a figure left over from an earlier cell.
coef_ax.figure.tight_layout()
# Use this model's own test accuracy in the file name; `acc_unique` still holds
# the accuracy of whichever model was tuned before the logistic regression.
lr_acc = lr.score(X_test, y_test)
plot_tree_name_2 = 'LogisticRegression Model Feature importance_'+'_acc_'+str(round(lr_acc,2))+'.png'
plot_tree_name_2 = Path('.', DIR, plot_tree_name_2)

plt.subplots_adjust(left=.3)
plt.savefig(plot_tree_name_2, bbox_inches='tight',dpi=200)

In [None]:
# Confusion matrix of the logistic regression on the held-out test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(lr, X_test, y_test)
modelname='LogisticRegression'
plt.title(modelname)
PLOT_NAME2 = 'confusion_matrix_'+modelname+'.png'
PLOT_NAME2 = Path('.', DIR, PLOT_NAME2)
plt.savefig(PLOT_NAME2, bbox_inches='tight',dpi=150)
plt.show()

In [None]:
# Cross-validated ROC curves for the logistic regression.
# A fresh clone is fitted on every split, so the model fitted on the training
# split is not overwritten by the last fold.
from sklearn.base import clone

classifier = lr

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(6, 6), dpi=250)
for i, (train, test) in enumerate(cv.split(X, y)):
    fold_model = clone(classifier)
    fold_model.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        fold_model,
        X[test],
        y[test],
        name="ROC fold {}".format(i),
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    # Resample each fold's curve onto the common FPR grid so the folds can be averaged.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)

# Mean curve over the folds, forced to end at (1, 1).
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

# Shade one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
plt.title("Logistic Regression",fontname="Times New Roman", fontsize=25)
plt.xlabel('False positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.ylabel('True positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.xticks(fontname="Times New Roman", fontsize=25)
plt.yticks(fontname="Times New Roman", fontsize=25)
ax.legend(loc="lower right")

# Accuracy and F1 over the same resampling scheme; both go into the file name.
score1=cross_validate(classifier, X, y, cv=cv, scoring=('accuracy','f1'))

print(score1)
PLOT_NAME6 = ('Receiver operating characteristic_'+'LogisticRegression Model_'
              +'_AUC_'+str(round(np.mean(mean_auc),3))
              +'_acc_'+str(round(np.mean(score1['test_accuracy']),3))
              +'_f1_'+str(round(np.mean(score1['test_f1']),3))+'.png')
PLOT_NAME6 = Path('.', DIR, PLOT_NAME6)
plt.savefig(PLOT_NAME6, dpi=250, bbox_inches='tight')
plt.show()

In [None]:
# Build and tune an XGBoost classifier.
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
parameters= [{'learning_rate':[0.01,0.1],'n_estimators':[50,100,500],'max_depth':[3,5,10]}]
# The search uses the same shuffle-split scheme (`cv`) as every other model so the
# selected models are comparable. Parallelism is left to XGBoost itself (n_jobs),
# so the grid search runs its candidates sequentially.
clf = GridSearchCV(XGBClassifier(
             min_child_weight=1,
             gamma=0.5,
             subsample=0.6,
             colsample_bytree=0.6,
             objective= 'binary:logistic', # logistic-regression loss for binary classification
             scale_pos_weight=1,
             reg_alpha=0,
             reg_lambda=1, n_jobs=CORE_NUM
            ),
            param_grid=parameters,scoring='roc_auc', cv=cv)
clf.fit(X_train, y_train)
print(clf.best_params_)
# Predictions of the refitted best model on the held-out test split.
y_pre= clf.predict(X_test)
y_pro= clf.predict_proba(X_test)[:,1]
print ("AUC Score : %f" % metrics.roc_auc_score(y_test, y_pro))
# Keep the test accuracy in `acc_unique`, as the other model cells do, so plots
# named after it refer to this model.
acc_unique = metrics.accuracy_score(y_test, y_pre)
print("Accuracy : %.4g" % acc_unique)
# NOTE: this rebinds `xgb` from the xgboost module alias to the tuned estimator;
# the following cells refer to the fitted model by this name.
xgb=clf.best_estimator_
print('xgb parameter = ',xgb)
f1.write('XGB optimized paras:\n'+str(xgb)+'\n')
if INPUT_Xtest is not None:
    predres1=xgb.predict(predtest)
    print(predres1)
    f1.write('Pred_full result is:'+str(predres1)+'\n')

In [None]:
from sklearn.inspection import permutation_importance
# Horizontal bar chart of the (up to) ten most important XGBoost features.
n_top = min(10, len(headers))
first_shown = len(headers) - n_top   # bars below this position are not drawn
tree_importance_sorted_idx = np.argsort(xgb.feature_importances_)
tree_indices = np.arange(0, len(xgb.feature_importances_)) + 0.5

fig, (ax1) = plt.subplots(figsize=(12, 6))
ax1.barh(tree_indices[first_shown:],
         xgb.feature_importances_[tree_importance_sorted_idx[first_shown:]], height=0.7)
# Fix the tick positions first and label them afterwards; labelling before the
# positions are fixed makes Matplotlib warn about a formatter/locator mismatch.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(np.array(headers)[tree_importance_sorted_idx])
# Restrict the view to the bars that were drawn.
ax1.set_ylim((first_shown,len(headers)))
plt.title("XGBoost CLassifier model feature importance",fontname="Times New Roman", fontsize=25)
plt.xlabel('Feature importance', fontdict={"family": "Times New Roman", "size": 25})
plt.ylabel('Feature names', fontdict={"family": "Times New Roman", "size": 25})
plt.xticks(fontname="Times New Roman", fontsize=25)
plt.yticks(fontname="Times New Roman", fontsize=25)
fig.tight_layout()
# Use this model's own test accuracy in the file name rather than whatever
# `acc_unique` was left at by a previously tuned model.
xgb_acc = xgb.score(X_test, y_test)
plot_tree_name_2 = 'XGBoost Classifier Model Feature importance_'+'_acc_'+str(round(xgb_acc,2))+'.png'
plot_tree_name_2 = Path('.', DIR, plot_tree_name_2)
plt.savefig(plot_tree_name_2, bbox_inches='tight',dpi=200)
plt.show()

In [None]:
# Confusion matrix of the tuned XGBoost classifier on the held-out test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(xgb, X_test, y_test)
modelname='XGBoost Classifier'
plt.title(modelname)
PLOT_NAME2 = 'confusion_matrix_'+modelname+'.png'
PLOT_NAME2 = Path('.', DIR, PLOT_NAME2)
plt.savefig(PLOT_NAME2, bbox_inches='tight',dpi=150)
plt.show()

In [None]:
# Cross-validated ROC curves for the tuned XGBoost classifier.
# A fresh clone is fitted on every split, so the tuned estimator keeps the fit
# it received from the grid search instead of being overwritten by the last fold.
from sklearn.base import clone

classifier = xgb

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(6, 6), dpi=250)
for i, (train, test) in enumerate(cv.split(X, y)):
    fold_model = clone(classifier)
    fold_model.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        fold_model,
        X[test],
        y[test],
        name="ROC fold {}".format(i),
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    # Resample each fold's curve onto the common FPR grid so the folds can be averaged.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)

# Mean curve over the folds, forced to end at (1, 1).
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

# Shade one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
plt.title("XGBoost Classifier",fontname="Times New Roman", fontsize=25)
plt.xlabel('False positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.ylabel('True positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.xticks(fontname="Times New Roman", fontsize=25)
plt.yticks(fontname="Times New Roman", fontsize=25)
ax.legend(loc="lower right")

# Accuracy and F1 over the same resampling scheme; both go into the file name.
score1=cross_validate(classifier, X, y, cv=cv, scoring=('accuracy','f1'))

print(score1)
PLOT_NAME6 = ('Receiver operating characteristic_'+'XGBoost CLassifier Model_'
              +'_AUC_'+str(round(np.mean(mean_auc),3))
              +'_acc_'+str(round(np.mean(score1['test_accuracy']),3))
              +'_f1_'+str(round(np.mean(score1['test_f1']),3))+'.png')
PLOT_NAME6 = Path('.', DIR, PLOT_NAME6)
plt.savefig(PLOT_NAME6, dpi=250, bbox_inches='tight')
plt.show()

In [None]:
# explainer = shap.Explainer(xgb, X)
# shap_values = explainer(X)
# shap.plots.bar(shap_values, max_display=12)

In [None]:
# Build and tune a k-nearest-neighbours classifier.
from sklearn.neighbors import KNeighborsClassifier
tuned_parameters = [{'n_neighbors': [5,15,50], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}]
KNeighbors= KNeighborsClassifier()

# Exhaustive grid search scored by ROC AUC on the shared shuffle-split scheme.
KNeighbors= GridSearchCV(KNeighbors, tuned_parameters, verbose=1, scoring='roc_auc', cv=cv, n_jobs=CORE_NUM)
KNeighbors.fit(X_train, y_train)
KNeighbors_new = KNeighbors.best_estimator_
best_p = KNeighbors.best_params_
paras = KNeighbors_new.get_params()
print(best_p)
# Accuracy of the best model on the held-out test split.
acc_unique = KNeighbors_new.score(X_test, y_test)
print('Current accuracy:', acc_unique)
# From here on `KNeighbors` is the tuned estimator itself, not the search object.
KNeighbors=KNeighbors_new
f1.write('KNN optimized paras:\n'+str(best_p)+'\n')
if INPUT_Xtest is not None:
    predres1=KNeighbors.predict(predtest)
    print(predres1)
    f1.write('Pred_full result is:'+str(predres1)+'\n')

In [None]:
# Confusion matrix of the tuned k-nearest-neighbours classifier on the held-out test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(KNeighbors, X_test, y_test)
modelname='NeighborsClassifier'
plt.title(modelname)
PLOT_NAME2 = 'confusion_matrix_'+modelname+'.png'
PLOT_NAME2 = Path('.', DIR, PLOT_NAME2)
plt.savefig(PLOT_NAME2, bbox_inches='tight',dpi=150)
plt.show()

In [None]:
# Cross-validated ROC curves for the tuned k-nearest-neighbours classifier.
# A fresh clone is fitted on every split, so the tuned estimator keeps the fit
# it received from the grid search instead of being overwritten by the last fold.
from sklearn.base import clone

classifier = KNeighbors

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(6, 6), dpi=250)
for i, (train, test) in enumerate(cv.split(X, y)):
    fold_model = clone(classifier)
    fold_model.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        fold_model,
        X[test],
        y[test],
        name="ROC fold {}".format(i),
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    # Resample each fold's curve onto the common FPR grid so the folds can be averaged.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)

# Mean curve over the folds, forced to end at (1, 1).
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

# Shade one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
plt.title("Nearest Neighbors",fontname="Times New Roman", fontsize=25)
plt.xlabel('False positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.ylabel('True positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.xticks(fontname="Times New Roman", fontsize=25)
plt.yticks(fontname="Times New Roman", fontsize=25)
ax.legend(loc="lower right")

# Accuracy and F1 over the same resampling scheme; both go into the file name.
score1=cross_validate(classifier, X, y, cv=cv, scoring=('accuracy','f1'))

print(score1)
PLOT_NAME6 = ('Receiver operating characteristic_'+'KNearest Neighbors Model_'
              +'_AUC_'+str(round(np.mean(mean_auc),3))
              +'_acc_'+str(round(np.mean(score1['test_accuracy']),3))
              +'_f1_'+str(round(np.mean(score1['test_f1']),3))+'.png')
PLOT_NAME6 = Path('.', DIR, PLOT_NAME6)
plt.savefig(PLOT_NAME6, dpi=250, bbox_inches='tight')
plt.show()

In [None]:
# Build and tune a multi-layer perceptron classifier.
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
# Scaling is part of the model: the scaler is fitted on the training data of each
# fit and applied automatically to whatever the model predicts on. The global X is
# left untouched, so X_train / X_test / predtest and the other models' cells keep
# seeing the same, unscaled data. Grid keys carry the 'mlp__' step prefix.
tuned_parameters = [{ 'mlp__activation': ['identity', 'logistic', 'tanh', 'relu'],'mlp__solver': ['lbfgs', 'sgd', 'adam'],'mlp__learning_rate_init': [0.001,0.01,0.1]}]
mlp = Pipeline([
    ('scaler', MinMaxScaler()),
    ('mlp', MLPClassifier( hidden_layer_sizes=(5, 3))),
])

# Exhaustive grid search scored by ROC AUC on the shared shuffle-split scheme.
mlp= GridSearchCV(mlp, tuned_parameters,verbose=1, scoring='roc_auc', cv=cv, n_jobs=CORE_NUM)
mlp.fit(X_train, y_train)
mlp_new = mlp.best_estimator_
best_p = mlp.best_params_
paras = mlp_new.get_params()
print(best_p)
# Accuracy of the best pipeline on the held-out test split.
acc_unique = mlp_new.score(X_test, y_test)
print('Current accuracy:', acc_unique)
# From here on `mlp` is the tuned scaler+MLP pipeline, not the search object.
mlp=mlp_new
f1.write('MLP optimized paras:\n'+str(best_p)+'\n')
if INPUT_Xtest is not None:
    predres1=mlp.predict(predtest)
    print(predres1)
    f1.write('Pred_full result is:'+str(predres1)+'\n')

In [None]:
# Confusion matrix of the tuned multi-layer perceptron on the held-out test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(mlp, X_test, y_test)
modelname='MLPClassifier'
plt.title(modelname)
PLOT_NAME2 = 'confusion_matrix_'+modelname+'.png'
PLOT_NAME2 = Path('.', DIR, PLOT_NAME2)
plt.savefig(PLOT_NAME2, bbox_inches='tight',dpi=150)
plt.show()

In [None]:
# Cross-validated ROC curves for the tuned multi-layer perceptron.
# A fresh clone is fitted on every split, so the tuned estimator keeps the fit
# it received from the grid search instead of being overwritten by the last fold.
from sklearn.base import clone

classifier = mlp

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(6, 6), dpi=250)
for i, (train, test) in enumerate(cv.split(X, y)):
    fold_model = clone(classifier)
    fold_model.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        fold_model,
        X[test],
        y[test],
        name="ROC fold {}".format(i),
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    # Resample each fold's curve onto the common FPR grid so the folds can be averaged.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)

# Mean curve over the folds, forced to end at (1, 1).
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

# Shade one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
plt.title("Multi-layer Perceptron classifier",fontname="Times New Roman", fontsize=25)
plt.xlabel('False positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.ylabel('True positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.xticks(fontname="Times New Roman", fontsize=25)
plt.yticks(fontname="Times New Roman", fontsize=25)
ax.legend(loc="lower right")

# Accuracy and F1 over the same resampling scheme; both go into the file name.
score1=cross_validate(classifier, X, y, cv=cv, scoring=('accuracy','f1'))

print(score1)
PLOT_NAME6 = ('Receiver operating characteristic_'+'Multi-layer Perceptron classifier Model_'
              +'_AUC_'+str(round(np.mean(mean_auc),3))
              +'_acc_'+str(round(np.mean(score1['test_accuracy']),3))
              +'_f1_'+str(round(np.mean(score1['test_f1']),3))+'.png')
PLOT_NAME6 = Path('.', DIR, PLOT_NAME6)
plt.savefig(PLOT_NAME6, dpi=250, bbox_inches='tight')
plt.show()

In [None]:
# SVM: a support-vector classifier with default settings.
from sklearn import svm
svc = svm.SVC()
# Hyper-parameter search is currently disabled for this model. To enable it, wrap the
# estimator as for the other models, e.g.
#   GridSearchCV(svm.SVC(), [{'kernel': ['linear']}], verbose=1, scoring='roc_auc', cv=cv)
# and continue with its best_estimator_ / best_params_.
svc.fit(X_train, y_train)
# No search is run here, so `best_p` would still hold the parameters of the model
# tuned before this cell; log the parameters this SVC actually uses instead.
f1.write('svc optimized paras:\n'+str(svc.get_params())+'\n')
if INPUT_Xtest is not None:
    predres1=svc.predict(predtest)
    print(predres1)
    f1.write('Pred_full result is:'+str(predres1)+'\n')

In [None]:
# Confusion matrix of the fitted SVC on the held-out test split, saved as a
# PNG under DIR.
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in
# 1.2; `ConfusionMatrixDisplay.from_estimator` is its replacement and is
# available from 1.0, the same release that introduced the
# `RocCurveDisplay.from_estimator` call this notebook already relies on.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(svc, X_test, y_test)
modelname = 'SVC'
plt.title(modelname)
PLOT_NAME2 = 'confusion_matrix_'+modelname+'.png'
PLOT_NAME2 = Path('.', DIR, PLOT_NAME2)
plt.savefig(PLOT_NAME2, bbox_inches='tight', dpi=150)
plt.show()

In [None]:
# Run classifier with cross-validation and plot ROC curves
#
# For every split produced by `cv`, a copy of the SVC is fitted on the
# training indices and its ROC curve on the test indices is drawn. The fold
# curves are interpolated onto a common FPR grid to obtain a mean curve with a
# +/- 1 std band. Accuracy and F1 come from a separate cross_validate run, and
# the figure is saved under DIR with the scores encoded in the file name.
from sklearn.base import clone

classifier = svc

tprs = []      # per-fold TPR values resampled onto `mean_fpr`
aucs = []      # per-fold AUC values
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(6, 6), dpi=250)
for i, (train, test) in enumerate(cv.split(X, y)):
    # Fit an unfitted copy for each fold: fitting `classifier` itself would
    # overwrite the model in `svc` (trained on X_train in the cell above) with
    # one trained on the last fold only.
    fold_model = clone(classifier).fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        fold_model,
        X[test],
        y[test],
        name="ROC fold {}".format(i),
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    # Resample onto the shared grid so the folds can be averaged point-wise,
    # and pin the first grid point to TPR = 0.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

# Diagonal reference line labelled "Chance".
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)

# Mean curve; the last grid point is pinned to TPR = 1 before computing AUC.
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

# Shaded band: mean TPR +/- one standard deviation, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(
    xlim=[-0.05, 1.05],
    ylim=[-0.05, 1.05],
)
plt.title("SVC", fontname="Times New Roman", fontsize=25)
plt.xlabel('False positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.ylabel('True positive rate', fontdict={"family": "Times New Roman", "size": 25})
plt.xticks(fontname="Times New Roman", fontsize=25)
plt.yticks(fontname="Times New Roman", fontsize=25)
ax.legend(loc="lower right")

# cross_validate works on its own copies of the estimator, so passing the
# shared `classifier` here does not refit it.
score1 = cross_validate(classifier, X, y, cv=cv, scoring=('accuracy', 'f1'))

print(score1)
# File name pattern matches the MLP ROC cell: <model>__AUC_<v>_acc_<v>_f1_<v>.
# The original inserted a stray '_acc_' tag before '_AUC_'; `mean_auc` is
# already a scalar, so no extra averaging is needed.
auc_tag = str(round(mean_auc, 3))
acc_tag = str(round(np.mean(score1['test_accuracy']), 3))
f1_tag = str(round(np.mean(score1['test_f1']), 3))
PLOT_NAME6 = ('Receiver operating characteristic_'+'SVC Model_'
              + '_AUC_'+auc_tag+'_acc_'+acc_tag+'_f1_'+f1_tag+'.png')
PLOT_NAME6 = Path('.', DIR, PLOT_NAME6)
plt.savefig(PLOT_NAME6, dpi=250, bbox_inches='tight')
plt.show()

In [None]:

# Close the log file handle `f1` that the model cells above write to.
# NOTE(review): `f1` is opened outside this section; run this cell last, since
# any later f1.write(...) would fail on the closed handle.
f1.close()