In [None]:
# Environment report: import the core libraries, then print the interpreter
# and library versions so a run can be tied to a concrete software stack.
import sys
import pandas as pd
import matplotlib
import numpy as np
import scipy as sp
import sklearn

print(f"Python version: {sys.version}")
print(f"pandas version: {pd.__version__}")
print(f"matplotlib version: {matplotlib.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"SciPy version: {sp.__version__}")
print(f"scikit-learn version: {sklearn.__version__}")

In [2]:
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Plot styling. The seaborn 'white' style is set after the matplotlib 'ggplot'
# style sheet, so keep this order when editing.
mpl.style.use('ggplot')
sns.set_style('white')
# Default figure size (width, height) for figures that do not pass figsize.
pylab.rcParams['figure.figsize'] = 12, 8

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last.
# Later cells rely on this to show f-string summaries placed mid-cell, and use
# `_ = ...` assignments so plotting calls do not echo their return values.
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# Seed passed as random_state to the splitters, SMOTE and the tuned model below.
SEED = 42 
# Binary relabelling consumed by get_x_y: key = new class (0 or 1),
# value = list of original labels mapped to that class.
GROUP = {0:[0], 1:[2]}

In [4]:
def mymetrics(y_true, y_pred):
    """Summarise binary-classification performance as a formatted string.

    Labels are compared against the literal values 1 (positive) and 0
    (negative); any other value is counted in none of the four cells.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    str
        ``'TPR = ..., TNR = ..., Percision = ..., Recall = ...'`` with each
        value rounded to 3 decimals. A ratio whose denominator is zero is
        reported as 0.0.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in length.
    """
    # Convert to plain arrays so comparison is positional; indexing a pandas
    # Series with y[i] is label-based and fails on a non-default index.
    true = np.asarray(y_true).ravel()
    pred = np.asarray(y_pred).ravel()
    if true.shape[0] != pred.shape[0]:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {true.shape[0]} and {pred.shape[0]}"
        )

    pred_pos, pred_neg = pred == 1, pred == 0
    true_pos, true_neg = true == 1, true == 0

    # Confusion-matrix cells as Python ints, so the ratios below are Python floats.
    TP = int(np.count_nonzero(pred_pos & true_pos))  # True Positive
    FP = int(np.count_nonzero(pred_pos & true_neg))  # False Positive
    TN = int(np.count_nonzero(pred_neg & true_neg))  # True Negative
    FN = int(np.count_nonzero(pred_neg & true_pos))  # False Negative

    def _ratio(numerator, denominator):
        # Guarded division: an empty denominator yields 0.0 instead of raising.
        return numerator / float(denominator) if denominator != 0 else 0.0

    TPR = _ratio(TP, TP + FN)  # True Positive Rate
    TNR = _ratio(TN, TN + FP)  # True Negative Rate
    Percision = _ratio(TP, TP + FP)
    Recall = _ratio(TP, TP + FN)
    # The output text (including its existing spelling) is kept unchanged.
    return f'TPR = {round(TPR, 3)}, TNR = {round(TNR, 3)}, Percision = {round(Percision, 3)}, Recall = {round(Recall, 3)}'

In [5]:
def get_x_y(X, y, group={0:[0], 1:[2]}):
    """Select the rows belonging to two label groups and relabel them as 0/1.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are matched to ``y`` by position.
    y : pandas.Series or array-like
        Original labels, one per row of ``X``.
    group : dict
        ``group[0]`` lists the original labels mapped to class 0 and
        ``group[1]`` those mapped to class 1. The mapping is only read.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The selected rows, in their original order, with shape
        ``(n_selected, n_features)`` (also when nothing is selected), and the
        matching 0/1 labels with shape ``(n_selected,)``. Rows whose label is
        in neither group are dropped.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or if a label present in ``y``
        is listed in both groups (its class would be ambiguous).
    """
    # Positional view of the labels, independent of the Series index.
    labels = pd.Series(np.asarray(y))
    if len(X) != len(labels):
        raise ValueError(f"X and y must have the same length, got {len(X)} and {len(labels)}")

    is_class0 = labels.isin(list(group[0])).to_numpy()
    is_class1 = labels.isin(list(group[1])).to_numpy()
    if np.any(is_class0 & is_class1):
        raise ValueError("group[0] and group[1] overlap on labels present in y")

    keep = is_class0 | is_class1
    # One positional slice instead of a per-row iloc; keeps the array 2-D when empty.
    new_X = X.iloc[np.flatnonzero(keep)].to_numpy()
    new_y = is_class1[keep].astype(int)
    return new_X, new_y

In [None]:
# Load the dataset; the first CSV column is used as the row index.
data = pd.read_csv('path/to/file', index_col=0)
# Preview the first two rows.
data.head(2)

In [None]:
from sklearn.preprocessing import StandardScaler

# Keep the label column aside so it is neither filtered nor scaled.
labels = data['dia_result']

# Per-feature variance on the raw (unscaled) values.
variances = data.drop(columns=['dia_result']).var()
threshold = 0.01
high_variance_features = variances[variances > threshold].index

# Apply the variance filter. Previously the selection above was computed but
# never used: filtered_data only dropped the label column.
filtered_data = data[high_variance_features]
print(f"Kept {len(high_variance_features)} of {len(variances)} features (variance > {threshold})")
print(filtered_data.head())

# NOTE(review): the scaler is fitted on all rows before the train/test split
# made further down, so test-set statistics influence the scaling - confirm
# whether this is acceptable for the reported results.
scaler = StandardScaler()
standardized_data = scaler.fit_transform(filtered_data)
standardized_df = pd.DataFrame(standardized_data, columns=filtered_data.columns)
# standardized_df has a fresh 0..n-1 index, so the labels are re-indexed to match by position.
standardized_df['dia_result'] = labels.reset_index(drop=True)
print(standardized_df.head())

In [None]:
# Summary statistics for every column of the standardized table (label included).
standardized_df.describe(include = 'all')

In [9]:
# Name of the label column in standardized_df (and in the external dataset).
target = 'dia_result'

In [None]:
# Class counts in the full standardized table (original labels 0 and 2).
f'Non-VSD: {(standardized_df[target]==0).sum()}; VSD: {(standardized_df[target]==2).sum()}'

# Features / label, and a seeded stratified 60/40 shuffle splitter.
y = standardized_df[target]
X = standardized_df.drop(columns=[target])
cv = model_selection.StratifiedShuffleSplit(n_splits=2, test_size=0.4, random_state=SEED)

# The splitter yields two splits; the last one is the split that is used.
train_index, test_index = list(cv.split(X, y))[-1]
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]
f'Train: {X_train.shape}, Distribution：({[(y_train==i).sum() for i in [0, 2]]}); Test: {X_test.shape}, Distribution:({[(y_test==i).sum() for i in [0, 2]]})'

In [11]:
# Keep only the rows whose label is listed in GROUP and relabel them as 0/1;
# get_x_y returns NumPy arrays.
X_train_, y_train_ = get_x_y(X_train, y_train, GROUP)
X_test_, y_test_ = get_x_y(X_test, y_test, GROUP)

In [None]:
from collections import Counter
from imblearn.over_sampling import SMOTE

# Class balance of the relabelled training and test sets before resampling.
print(f"Training set class distribution: {Counter(y_train_)}")
print(f"Test set class distribution: {Counter(y_test_)}")

# SMOTE over-sampler seeded with SEED.
smote = SMOTE(random_state=SEED)

# Only the training arrays are passed to SMOTE; X_test_/y_test_ are not resampled.
X_train_smote, y_train_smote = smote.fit_resample(X_train_, y_train_)

# Class counts after resampling.
print(pd.Series(y_train_smote).value_counts())

In [25]:
# Machine Learning Algorithm (MLA) selection and initialization.
# Every estimator that accepts random_state is seeded with SEED so the
# comparison table is reproducible across runs; the remaining estimators
# (naive Bayes, k-NN, LDA, QDA) take no random_state argument.
MLA = [
    ensemble.AdaBoostClassifier(random_state=SEED),
    ensemble.ExtraTreesClassifier(random_state=SEED),
    ensemble.GradientBoostingClassifier(random_state=SEED),
    ensemble.RandomForestClassifier(random_state=SEED),
    linear_model.LogisticRegressionCV(random_state=SEED),
    linear_model.PassiveAggressiveClassifier(random_state=SEED),
    linear_model.SGDClassifier(random_state=SEED),
    linear_model.Perceptron(random_state=SEED),
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    neighbors.KNeighborsClassifier(),
    svm.SVC(probability=True, random_state=SEED),
    svm.LinearSVC(random_state=SEED),
    tree.DecisionTreeClassifier(random_state=SEED),
    tree.ExtraTreeClassifier(random_state=SEED),
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis()
    ]

# Five seeded, stratified 60/40 shuffle splits used for cross-validation.
cv_split = model_selection.StratifiedShuffleSplit(n_splits=5, test_size=0.4, train_size =0.6, random_state=SEED) 

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Train AUC Mean', 'MLA Valid AUC Mean', 
               'MLA Valid AUC 3*STD', 'MLA Time', 'MLA Test AUC']

# One record per algorithm; the table is built once at the end instead of
# being grown cell-by-cell on an empty (object-dtype) DataFrame.
MLA_records = []

# NOTE(review): cross-validation runs on the SMOTE-resampled training set, so
# synthetic samples can land in validation folds - confirm this is intended.
for alg in MLA:
    record = {'MLA Name': alg.__class__.__name__, 'MLA Test AUC': np.nan}
    try:
        record['MLA Parameters'] = str(alg.get_params())

        cv_results = model_selection.cross_validate(
            alg, X_train_smote, y_train_smote, cv=cv_split, scoring='roc_auc', return_train_score=True
        )
        record['MLA Time'] = cv_results['fit_time'].mean()
        record['MLA Train AUC Mean'] = cv_results['train_score'].mean()
        record['MLA Valid AUC Mean'] = cv_results['test_score'].mean()
        record['MLA Valid AUC 3*STD'] = cv_results['test_score'].std() * 3

        # Refit on the whole resampled training set, then score the held-out test set.
        alg.fit(X_train_smote, y_train_smote)
        # ROC AUC only needs a ranking score: use class-1 probabilities when
        # the model offers them, otherwise the decision-function margin, so
        # margin-only models are scored instead of being dropped.
        if hasattr(alg, 'predict_proba'):
            y_test_pred = alg.predict_proba(X_test_)[:, 1]
        else:
            y_test_pred = alg.decision_function(X_test_)
        record['MLA Test AUC'] = roc_auc_score(y_test_, y_test_pred)
    except Exception as e:
        # Best effort: report the failing model and keep whatever was collected;
        # its test AUC stays NaN.
        print(f"Model {alg.__class__.__name__} Error: {e}")
    MLA_records.append(record)

# Best test AUC first; models without a test AUC (NaN) sort last.
MLA_compare = (
    pd.DataFrame(MLA_records, columns=MLA_columns)
    .sort_values(by=['MLA Test AUC'], ascending=False)
)
print(MLA_compare)

In [None]:
# Models that produced a test AUC, with the AUC column as a numeric dtype so
# the bar lengths are plotted as numbers.
MLA_compare_test = (
    MLA_compare
    .dropna(subset=["MLA Test AUC"])
    .astype({"MLA Test AUC": float})
)
fig = plt.figure(figsize=(10, 6), dpi=300)
_ = sns.barplot(x='MLA Test AUC', y = 'MLA Name', data = MLA_compare_test, color = 'm') 
# The plotted metric is ROC AUC (not accuracy), so the title names it as such.
_ = plt.title('Machine Learning Algorithm AUC in Test Set \n')
_ = plt.xlabel('AUC')
_ = plt.ylabel('Algorithm')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from skopt import BayesSearchCV

# Search space for the logistic-regression hyper-parameters. The key order is
# left untouched so the optimizer sees the dimensions in the same order.
param_space = {
    'C': (1e-3, 1e3, 'log-uniform'),
    'penalty': ['l2', 'l1'],
    'solver': ['liblinear', 'saga'],
    'class_weight': [None]
}

# Seeded base estimator.
model = LogisticRegression(max_iter=1000, random_state=SEED)

# 50 Bayesian-optimization iterations, 5-fold CV, scored by ROC AUC, all cores.
bayes_search = BayesSearchCV(
    estimator=model,
    search_spaces=param_space,
    n_iter=50,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=SEED
)

bayes_search.fit(X_train_smote, y_train_smote)

# Keep the refitted best estimator for the evaluation cells that follow.
best_model = bayes_search.best_estimator_

print("Best Parameters:", bayes_search.best_params_)
print("Best Score (ROC AUC):", bayes_search.best_score_)

# Positive-class probabilities on the internal test set.
y_test_pred = best_model.predict_proba(X_test_)[:, 1]
print("Test ROC AUC:", roc_auc_score(y_test_, y_test_pred))

In [None]:
from sklearn.metrics import roc_auc_score
# Recompute the tuned model's positive-class probabilities on the internal test
# set and report ROC AUC (repeats the last two statements of the tuning cell).
y_test_pred = best_model.predict_proba(X_test_)[:, 1]
print("Test ROC AUC:", roc_auc_score(y_test_, y_test_pred))

In [None]:
# ROC curve of the tuned model on the internal test set.
fpr, tpr, thresholds = metrics.roc_curve(y_test_, y_test_pred, pos_label=1)
fig, ax = plt.subplots(figsize=(8, 6), dpi=300)
_ = ax.plot(fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % metrics.auc(fpr, tpr))
# Diagonal reference line.
_ = ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
_ = ax.set_xlim([-0.01, 1.01])
_ = ax.set_ylim([-0.01, 1.01])
_ = ax.set_xlabel("False Positive Rate")
_ = ax.set_ylabel("True Positive Rate")
_ = ax.legend(loc="lower right")
_ = plt.show()

In [None]:
from sklearn.metrics import classification_report

# Choose the probability cut-off that maximizes Youden's J (TPR - FPR).
# NOTE(review): the cut-off is selected on the same internal test set it is
# then reported on, so this report is likely optimistic - confirm intent.
fpr, tpr, thresholds = metrics.roc_curve(y_test_, y_test_pred, pos_label=1)
youden_index = tpr - fpr
optimal_idx = youden_index.argmax()
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal threshold: {optimal_threshold}")

# Binarize the test probabilities at the selected cut-off.
y_pred_optimal = np.where(y_test_pred >= optimal_threshold, 1, 0)
print("Classification Report:")
print(classification_report(y_test_, y_pred_optimal))
# Accuracy as returned by the model's own score(), i.e. from predict(), not from the cut-off above.
f'Test Accuracy = {best_model.score(X_test_, y_test_)}'

#### External Dataset Validation

In [18]:
# Load the external cohort and keep the same feature columns, in the same
# order, as the training data.
data_predict = pd.read_csv('path/to/file', index_col=0)
X_predict = data_predict[X_train.columns]
y_predict = data_predict[target]
# Reuse the scaler fitted on the development data. Fitting a new scaler on the
# external cohort would put its features on a different scale from the one the
# model was trained on, and would let external-set statistics shape the inputs.
standardized_data_pd = scaler.transform(X_predict)
standardized_df_pd = pd.DataFrame(standardized_data_pd, columns=X_predict.columns)
# Keep only the labels listed in GROUP and relabel them as 0/1 (matched by position).
X_predict_, y_predict_ = get_x_y(standardized_df_pd, y_predict, GROUP)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Positive-class probabilities and the model's own predict() labels on the external cohort.
probs = best_model.predict_proba(X_predict_)[:, 1]
predictions = best_model.predict(X_predict_)

# Rates computed from the predict() labels.
metrics_result = mymetrics(y_predict_, predictions)
print(f'External Data Metrics: {metrics_result}')

# Report at the cut-off selected earlier (optimal_threshold), plus threshold-free AUC.
y_pred_optimal = np.where(probs >= optimal_threshold, 1, 0)
print("Classification Report:\n", classification_report(y_predict_, y_pred_optimal))
print("External Test ROC AUC:", roc_auc_score(y_predict_, probs))

In [None]:
# ROC curve of the tuned model on the external cohort.
fpr, tpr, thresholds = metrics.roc_curve(y_predict_, probs, pos_label=1)
fig, ax = plt.subplots(figsize=(8, 6), dpi=300)
_ = ax.plot(fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % metrics.auc(fpr, tpr))
# Diagonal reference line.
_ = ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
_ = ax.set_xlim([-0.01, 1.01])
_ = ax.set_ylim([-0.01, 1.01])
_ = ax.set_xlabel("False Positive Rate")
_ = ax.set_ylabel("True Positive Rate")
_ = ax.legend(loc="lower right")
_ = plt.show()