In [None]:
import pandas as pd
import numpy as np
import copy
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams["font.sans-serif"] = ["Times New Roman"] # Change font as Times New Roman

from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, auc, RocCurveDisplay

from sklearn.ensemble import RandomForestClassifier

from rdkit import Chem
from rdkit.Chem import MolFromSmiles,MolFromSmarts
from collections import defaultdict
import seaborn as sns 

#### Data Input

In [None]:
# Workbook whose first column is used as the row index and which holds
# the "Exhaustion" column that the class labels are derived from.
path = "../Data/131 Exhaustion to PLA.xlsx"

# Parse the workbook once and reuse it for both the row labels and the target
# (it was previously read from disk twice).
exh_table = pd.read_excel(path, index_col = 0)
nams = exh_table.index.values

# Fingerprint tables; the first column of every CSV is dropped by position.
mac_fp = pd.read_csv("../Data/model_maccs.csv").iloc[:,1:]
pub_fp = pd.read_csv("../Data/model_pubchem.csv").iloc[:,1:]
sub_fp = pd.read_csv("../Data/model_substructure.csv").iloc[:,1:]
suc_fp = pd.read_csv("../Data/model_substructure count.csv").iloc[:,1:]
est_fp = pd.read_csv("../Data/model_estate.csv").iloc[:,1:]

# Give every fingerprint table the workbook's row labels so that the
# column-wise concat below aligns rows by label.
# NOTE(review): this assumes each CSV lists its rows in the same order as the
# workbook -- confirm; a length mismatch raises here, a reordering would not.
for fp_table in (mac_fp, pub_fp, sub_fp, suc_fp, est_fp):
    fp_table.index = nams
esm_fp = pd.concat([mac_fp, pub_fp, sub_fp, suc_fp, est_fp], axis = 1)

# Width of one class bin: floor(score / 80) maps [0, 80) -> 0 and [80, 160) -> 1.
# NOTE(review): presumably scores are percentages, which would make this a
# binary label with the cut-off at 80 -- confirm.
EXHAUSTION_BIN_WIDTH = 80
exhs = exh_table["Exhaustion"]
cate = [int(score // EXHAUSTION_BIN_WIDTH) for score in exhs]

#### Preprocessing

In [None]:
# Step 1: drop zero-variance (constant) columns.
# Only the fitted support mask is needed, so call fit() instead of
# fit_transform(); the transformed array was discarded immediately before.
# Selecting with .loc keeps the result a labelled DataFrame.
var = VarianceThreshold(threshold = 0)
var.fit(esm_fp)
esm_fp2 = esm_fp.loc[:, var.get_support()]

# Step 2: drop one column out of every highly correlated pair.
# `upper` keeps only the strict upper triangle of the absolute correlation
# matrix, so each pair is seen once and the later column of a pair is the one
# that gets flagged.
CORR_CUTOFF = 0.9
corr_matrix = esm_fp2.corr(numeric_only = True).abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool))

# Vectorised column mask; NaN entries (masked lower triangle / diagonal)
# compare as False, so they never trigger a drop.
to_drop = upper.columns[(upper > CORR_CUTOFF).any(axis = 0)].tolist()
esm_fp3 = esm_fp2.drop(to_drop, axis = 1)

In [None]:
# Scorer name understood by scikit-learn -> short label for that metric.
criterion = {
    "accuracy": "ACC",
    "f1_weighted": "F1",
    "roc_auc": "AUROC",
}

# Seeded, shuffled 10-fold stratified splitter shared by the cells below.
SKFold = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)

# Stratified 80/20 hold-out split of the filtered fingerprints and labels.
fp_tra, fp_tes, cate_tra, cate_tes = train_test_split(
    esm_fp3,
    cate,
    train_size = 0.8,
    stratify = cate,
    random_state = 41,
)

#### 10-fold Stratified Cross-validation

In [None]:
# Training features cast to float and training labels as a flat 1-D array.
X = fp_tra.astype("float")
Y = np.array(cate_tra).reshape(-1)

# One seeded forest is refit on every fold; the out-of-fold predictions and
# their matching true labels are pooled across folds under "pred" / "test".
RF = RandomForestClassifier(random_state = 42)
predictions = defaultdict(list)

for train_index, test_index in SKFold.split(X, Y):
    X_train, Y_train = X.iloc[train_index], Y[train_index]
    X_test, Y_test = X.iloc[test_index], Y[test_index]

    # fit() returns the estimator itself, so the calls can be chained
    Y_pred = RF.fit(X_train, Y_train).predict(X_test)

    predictions["pred"].extend(Y_pred)
    predictions["test"].extend(Y_test)

#### Confusion Matrix

In [None]:
# Confusion matrix over the pooled out-of-fold predictions
# (rows: true class, columns: predicted class).
cm_10 = confusion_matrix(predictions["test"], predictions["pred"])

sns.set_theme(style = "whitegrid", font = 'Times New Roman', font_scale = 2)
plt.figure()
# The cells hold integer counts, so annotate with fmt="d". The previous
# ".2g" format switches to scientific notation ("1e+02") once a count
# reaches three digits.
# NOTE(review): vmax = 60 caps the colour scale; counts above it all get the
# top colour -- confirm this is intended.
sns.heatmap(cm_10, annot = True, fmt = "d", cmap = "viridis", linewidths = 3, vmin = 0, vmax = 60, linecolor = "white", square = True)
plt.xlabel("Predict", fontsize = 20)
plt.ylabel("True", fontsize = 20)
plt.show()

#### Detailed Performance

In [None]:
# Mean cross-validated score for every metric listed in `criterion`,
# using the same splitter and estimator as the manual loop.
labels_tra = np.array(cate_tra).ravel()  # loop-invariant, so built once

performance = []
for scoring in criterion:
    answer = cross_val_score(RF, fp_tra, labels_tra, cv = SKFold, scoring = scoring)
    performance.append(answer.mean().round(3))

# Report each score next to its short label from `criterion`; `performance`
# remains a plain list in the same order as the keys of `criterion`.
for label, score in zip(criterion.values(), performance):
    print(f"{label}: {score:.3f}")

#### ROC Curve

In [None]:
# Per-fold ROC curves plus their mean and a +/- 1 std band, all on one axes.
tprs, aucs = [], []
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid every fold is resampled onto

fig, ax = plt.subplots(figsize = (15, 15))
# (Removed `fig.bwith = 10`: it only set an ad-hoc attribute on the Figure
# and had no effect on the plot.)
for i, (train_index, test_index) in enumerate(SKFold.split(X, Y)):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    Y_train, Y_test = Y[train_index], Y[test_index]

    RF.fit(X_train, Y_train)
    viz = RocCurveDisplay.from_estimator(RF, X_test, Y_test, name = "ROC fold{}".format(i), alpha = 0.5,
                                         lw = 4, ax = ax)
    # Resample this fold's curve onto the shared grid so folds can be averaged
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0  # force the curve to start at the origin
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

mean_tpr = np.mean(tprs, axis = 0)
mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)

ax.plot(mean_fpr, mean_tpr, color = "blue", label = r"Mean ROC(AUC = %0.2f ± %0.2f)" % (mean_auc, std_auc),
        lw = 8, alpha = 0.9)

# Shaded band: mean TPR +/- one standard deviation, clipped to [0, 1]
std_tpr = np.std(tprs, axis = 0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color = "grey", alpha = .2, label = "± 1 std. dev.")

ax.set(xlim = [-0.05, 1.05], ylim = [-0.05, 1.05])
ax.axis("square")
ax.grid()
ax.legend(loc = "lower right")
ticks = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
ax.set_xlabel("False Positive Rate", labelpad = 15, fontsize = 50, fontweight = "bold")
ax.set_ylabel("True Positive Rate", labelpad = 15, fontsize = 50, fontweight = "bold")
ax.set_xticks(ticks, labels = ticks, fontsize = 50, fontweight = "bold")
ax.set_yticks(ticks, labels = ticks, fontsize = 50, fontweight = "bold")
plt.setp(ax.spines.values(), linewidth = 4, color = "black")
# Chance-level diagonal and tick styling go through `ax` explicitly instead
# of relying on pyplot's implicit "current axes".
ax.plot([0, 1], [0, 1], linestyle = "--", color = "black", alpha = 0.8, lw = 4)
ax.tick_params(axis = "both", length = 10, width = 4)
# plt.savefig("GB ROC.png", dpi = 300)

plt.show()