### Import libraries

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap

from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import tree
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from numpy import unique

from lightgbm import LGBMClassifier

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings; consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
# Do not report NumPy "invalid value" floating-point conditions.
np.seterr(invalid='ignore')

## Data preparation

#### Train set

In [None]:
# Load the training split; the first sheet row holds the column names.
train_df = pd.read_excel(
    'FNOL_PulFib2022_train.xlsx',
    header=0)
# Drop the index column that was written into the sheet when it was exported.
train_df = train_df.drop(columns=["Unnamed: 0", ])

# Recode the target from {-1, 1} to {0, 1} on the frame itself.  Doing the
# replacement in place on a Series taken out of the frame only propagates back
# to the frame on some pandas versions (it does not under copy-on-write), which
# would leave train_df and y_train with different label encodings.
train_df["CS_recommended"] = train_df["CS_recommended"].replace(-1, 0)

# Features: everything except the target and the row identifier.
X_train = train_df.drop(columns=["CS_recommended", "ID"])
y_train = train_df["CS_recommended"]
train_df


#### Test set

In [None]:
# Load the held-out test split; the first sheet row holds the column names.
test_df = pd.read_excel(
    'FNOL_PulFib2022_test.xlsx', header=0)
# Drop the index column that was written into the sheet when it was exported.
test_df = test_df.drop(columns=["Unnamed: 0", ])

# Recode the target from {-1, 1} to {0, 1} on the frame itself.  Doing the
# replacement in place on a Series taken out of the frame only propagates back
# to the frame on some pandas versions (it does not under copy-on-write), which
# would leave test_df and y_test with different label encodings.
test_df["CS_recommended"] = test_df["CS_recommended"].replace(-1, 0)

# Features: everything except the target and the row identifier.
X_test = test_df.drop(columns=["CS_recommended", "ID"])
y_test = test_df["CS_recommended"]
test_df


#### All data

In [None]:
# Stack the train rows on top of the test rows (row labels are not reset).
all_data_df = pd.concat((train_df, test_df), axis=0)

# Split the combined frame into target and features (target and ID removed).
all_data_y = all_data_df.loc[:, "CS_recommended"]
all_data_x = all_data_df.drop(["CS_recommended", "ID"], axis=1)
all_data_df

#### Train distribution

In [None]:
sns.set_theme(style="whitegrid")

# One horizontal bar per target class in the training split.
ax = sns.countplot(y="CS_recommended", data=train_df, alpha=1, palette="Paired")
total = len(train_df)

# Write each bar's share of all training rows at the tip of the bar,
# vertically centred on it.
for p in ax.patches:
    bar_length = p.get_width()
    x = p.get_x() + bar_length
    y = p.get_y() + p.get_height() / 2
    percentage = f'{100 * bar_length / total:.1f}%'
    ax.annotate(percentage, (x, y))

# Absolute class counts alongside the plot.
print(train_df.groupby('CS_recommended').size())
plt.show()


#### Test distribution

In [None]:
# One horizontal bar per target class in the test split.
ax = sns.countplot(y="CS_recommended", data=test_df, alpha=1, palette="Paired")
total = len(test_df)

# Write each bar's share of all test rows at the tip of the bar,
# vertically centred on it.
for p in ax.patches:
    bar_length = p.get_width()
    x = p.get_x() + bar_length
    y = p.get_y() + p.get_height() / 2
    percentage = f'{100 * bar_length / total:.1f}%'
    ax.annotate(percentage, (x, y))

# Absolute class counts alongside the plot.
print(test_df.groupby('CS_recommended').size())
plt.show()

## Feature selection

#### SHAP

In [None]:
# Fit a plain decision tree on all training features; it is only used as the
# model that SHAP explains.
reg = DecisionTreeClassifier(random_state=2)
reg.fit(X_train, y_train)

# NOTE: despite the variable name this is the classifier's mean accuracy on
# the test split (what ``score`` returns for a classifier), not an R^2 value.
r2_score = reg.score(X_test, y_test)
print(r2_score)

# SHAP values are computed for the test rows, so the summary plot must be given
# the same test rows as its feature matrix (it was previously handed X_train,
# whose rows do not correspond to the computed SHAP values).
shap_values = shap.TreeExplainer(reg, ).shap_values(X_test)
shap.summary_plot(shap_values, X_test, class_names=['No regression', 'Regression'], show=False)


#### K best

In [None]:
# Rank the features with SelectKBest's default scoring function and keep 25.
# NOTE(review): the selector is fitted on train AND test rows combined, so the
# ranking has seen the held-out data — confirm this is intended.
fs = SelectKBest(k=25).fit(all_data_x, all_data_y)

# Boolean mask over the columns -> names of the retained features.
mask = fs.get_support()
new_features = all_data_x.columns[mask]

new_features

#### Select features

In [None]:
# Hand-picked feature subset used by every model below.  ``unique`` removes
# duplicates and returns the names sorted, as a NumPy array.
select_feature = unique([
    'pneumonia',
    'CS_another_problems',
    'CS',
    'olfactory_loss',
    'postcovid_disability',
    'SARS - CoV - 2 IgG(qualit.)',
    'SARS - CoV - 2 IgM(quant.)',
    'CS_amount',
    'CS_total',
    'CS_duration_weeks',
    'KO_RDW',
    'VC(abs)',
    'FVC( % pred)',
    'FEV1(abs)',
    'KO_Mo %',
    'PEF( % pred)',
    'DLCOc_SB(abs)',
    'KCO_SB(abs)',
    'persistent_cough',
    'persistent_dyspnea',
    'MEF25(abs)',
])

# Restrict both splits to the selected columns (in the sorted order above).
x_train_filtered = X_train.loc[:, select_feature]
x_test_filtered = X_test.loc[:, select_feature]

## Train and test (Manual)

In [None]:
def get_scores(report_df, model, X_test, y_test, name):
    """Append one row of test-set metrics for *model* to *report_df*.

    Parameters
    ----------
    report_df : pandas.DataFrame
        Report accumulated so far (may be empty). It is not modified.
    model : fitted classifier
        Must provide ``score``, ``predict`` and ``predict_proba``.
    X_test, y_test
        Evaluation features and true binary labels.
    name : str
        Index label of the new row.

    Returns
    -------
    pandas.DataFrame
        A new frame: *report_df* followed by the row for *model*, with the
        columns Accuracy, Balanced accuracy, ROC-AUC, F1, Precision, Recall.
    """
    # Predict once and reuse the result for every label-based metric.
    y_pred = model.predict(X_test)
    # ROC-AUC is computed from the second column of predict_proba.
    y_score = model.predict_proba(X_test)[:, 1]

    report = pd.DataFrame(
        data={
            'Accuracy': [model.score(X_test, y_test)],
            'Balanced accuracy': [balanced_accuracy_score(y_test, y_pred)],
            'ROC-AUC': [roc_auc_score(y_test, y_score)],
            'F1': [f1_score(y_test, y_pred)],
            'Precision': [precision_score(y_test, y_pred, average="binary")],
            'Recall': [recall_score(y_test, y_pred, average="binary")],
        },
        index=[name],
    )

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([report_df, report])

In [None]:
# Start the manual experiments with an empty metrics table and an empty dict.
df_report, results_dict = pd.DataFrame(), {}

In [None]:
# Hand-tuned estimators for the manual comparison.
alg_lr = LogisticRegression(class_weight='balanced')
# max_features='auto' was removed from DecisionTreeClassifier in scikit-learn
# 1.3; for classifiers it meant sqrt(n_features), which 'sqrt' spells out.
alg_DT = DecisionTreeClassifier(random_state=2022, max_depth=6, min_samples_leaf=8, max_features='sqrt')
alg_RF = RandomForestClassifier(criterion='entropy', random_state=2022, max_depth=6)
alg_MLP = MLPClassifier(hidden_layer_sizes=(70, 8), solver='adam', random_state=42, max_iter=80, activation="relu")
alg_KN = KNeighborsClassifier(weights='distance', n_neighbors=5)
# probability=True is required so that predict_proba is available for ROC-AUC.
alg_SVC = SVC(random_state=42, C=3, probability=True, class_weight='balanced', kernel="sigmoid")
alg_adaboost = AdaBoostClassifier(n_estimators=12, random_state=42, learning_rate=0.8)
alg_xgboost = XGBClassifier(max_depth=10, eval_metric='logloss', seed=4)

alg_lgbm = LGBMClassifier(class_weight='balanced', learning_rate=0.5, max_depth=3)

# Order in which the models are trained and reported.
alg_list = [alg_lr, alg_MLP, alg_DT, alg_RF, alg_KN, alg_SVC, alg_adaboost, alg_xgboost, alg_lgbm]

for reg in alg_list:
    model_name = type(reg).__name__

    # Fit on the selected training features and score on the test split.
    reg.fit(x_train_filtered, y_train)
    score = reg.score(x_test_filtered, y_test)
    print(model_name)
    print(score)

    # Raw-count confusion matrix of the test predictions.
    predicts = reg.predict(x_test_filtered)
    cm = confusion_matrix(y_test, predicts)

    # One heatmap figure per model, titled with the model name and accuracy.
    plt.figure(figsize=(9, 9))
    sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    all_sample_title = f'{model_name}. Accuracy Score: {score}'
    plt.title(all_sample_title, size=15)

    # Add this model's metrics row to the running report.
    df_report = get_scores(df_report, reg, x_test_filtered, y_test, model_name)

In [None]:
# Display the metrics table collected for the manually configured models.
df_report

## Train and test (with RandomizedSearchCV)

In [None]:
# Shared settings for the randomized-search experiments.
num_iteration = 150  # iteration budget read by xval_test
# 5-fold splitter with shuffling; the fixed seed keeps the folds reproducible.
cv_inner = KFold(n_splits=5, shuffle=True, random_state=42)
# Metrics table for the tuned models, initially empty.
df_report_2 = pd.DataFrame()

In [None]:
def xval_test(model, space, df_report, draw_dt=False):
    """Tune *model* with a randomized search and report its test-set metrics.

    A single ``RandomizedSearchCV`` samples ``num_iteration`` candidates from
    *space*, ranks them by cross-validated ROC-AUC on the training split
    (folds from ``cv_inner``) and refits the best one on the full training
    split.  That refitted model is then evaluated once on the test split.

    Reads the module-level ``x_train_filtered``, ``y_train``,
    ``x_test_filtered``, ``y_test``, ``cv_inner``, ``num_iteration`` and
    ``select_feature``.

    Parameters
    ----------
    model : estimator
        Unfitted classifier to tune; it must support ``predict_proba``.
    space : dict
        Parameter name -> candidate values, as accepted by RandomizedSearchCV.
    df_report : pandas.DataFrame
        Report accumulated so far.
    draw_dt : bool, default False
        If true, plot the tuned model with ``tree.plot_tree`` and save the
        figure as ``decision_tree.svg`` (only meaningful for decision trees).

    Returns
    -------
    pandas.DataFrame
        *df_report* extended by one row for the tuned model.
    """
    # The search used to be re-created inside a loop of ``num_iteration``
    # passes, each with the same seed and the same folds, so every pass
    # evaluated the same 10 candidates; the "best" pass was then chosen by
    # test-set accuracy, and the result variable stayed unbound if that
    # accuracy never exceeded 0.  Spending the budget as ``n_iter`` explores
    # genuinely different candidates and keeps model selection on the
    # training folds only.  If the grid is smaller than ``n_iter``,
    # scikit-learn falls back to evaluating the whole grid.
    search = RandomizedSearchCV(model, space, n_iter=num_iteration, scoring='roc_auc',
                                random_state=42, cv=cv_inner, refit=True)
    result = search.fit(x_train_filtered, y_train)

    best_model_save = result.best_estimator_
    params_best = result.best_params_
    yhat = best_model_save.predict(x_test_filtered)
    acc_best = accuracy_score(y_test, yhat)

    model_name = type(best_model_save).__name__
    print(model_name)
    print('>acc=%.3f, cfg=%s' % (acc_best, params_best))
    print('Accuracy: %.3f ' % acc_best)

    # Confusion matrix expressed as fractions of all test samples.
    cm = confusion_matrix(y_test, yhat, normalize='all')

    # Plot confusion matrix
    plt.figure(figsize=(9, 9))
    sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title('{0}. Accuracy Score: {1}'.format(model_name, acc_best), size=15)

    # Reports
    df_report = get_scores(df_report, best_model_save, x_test_filtered,
                           y_test, model_name)

    if draw_dt:
        fig = plt.figure(figsize=(25, 20))
        # plot_tree is given a plain list: some scikit-learn releases reject a
        # NumPy array for feature_names.
        _ = tree.plot_tree(best_model_save,
                           feature_names=list(select_feature),
                           class_names=['0', '1'],
                           filled=True)
        fig.savefig("decision_tree.svg", format='svg', dpi=1200)

    return df_report

In [None]:
# Base estimators for the randomized search; only seeds, class weighting and
# the options needed for scoring are fixed here, the rest comes from the
# search spaces.
alg_lr = LogisticRegression(class_weight='balanced')
alg_DT = DecisionTreeClassifier(random_state=2022)
alg_RF = RandomForestClassifier(random_state=2022)
alg_MLP = MLPClassifier(random_state=42)
alg_KN = KNeighborsClassifier()
# probability=True makes predict_proba available on the SVC.
alg_SVC = SVC(class_weight='balanced', probability=True, random_state=42)
alg_adboost = AdaBoostClassifier(random_state=42)
alg_xgboost = XGBClassifier(eval_metric='logloss')
alg_lgbm = LGBMClassifier(class_weight='balanced')

# Candidate hyper-parameter values sampled by the randomized search, one
# dictionary per model.  Values that the estimators reject outright were
# removed so that no search iterations are wasted on fits that always fail.

# Random forest
space_rf = {
    "n_estimators": list(range(1, 60, 3)),
    "max_features": list(range(1, 15)),
    "max_depth": list(range(2, 10)),
    "criterion": ["gini", "entropy"],
}

# Logistic regression
space_lr = {
    # C must be strictly positive, so the 0.0 grid point is dropped.
    "C": np.linspace(0, 10, 100)[1:],
    "solver": ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}

# Decision tree
space_dt = {
    # "auto" was removed in scikit-learn 1.3; for classifiers it was the same
    # as "sqrt", which is still listed.
    "max_features": ["sqrt", "log2"],
    "max_depth": list(range(2, 15)),
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(1, 20)),
}

# Multi-layer perceptron (an integer size means one hidden layer of that width)
space_mlp = {
    "solver": ['lbfgs', 'sgd', 'adam'],
    "hidden_layer_sizes": list(range(2, 200)),
    "max_iter": list(range(2, 150)),
}

# k-nearest neighbours
space_kn = {
    "n_neighbors": list(range(2, 30)),
    "weights": ['uniform', 'distance'],
    "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

# Support vector classifier
space_svc = {
    # C must be strictly positive, so the 0.0 grid point is dropped.
    "C": np.linspace(0, 10, 100)[1:],
    "kernel": ['poly', 'rbf'],
}

# AdaBoost
space_adboost = {
    # learning_rate must be strictly positive, so the 0.0 grid point is dropped.
    "learning_rate": np.linspace(0, 1, 10)[1:],
    "n_estimators": list(range(1, 30)),
}

# XGBoost
space_xgboost = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

# LightGBM
space_lgbm = {
    'learning_rate': [0.005, 0.1],
    # LightGBM requires num_leaves > 1.
    'num_leaves': list(range(2, 20)),
}



In [None]:
# Tune every model in turn and collect one report row per model.  The third
# entry is the draw_dt flag: only the decision tree also has its tree drawn.
for _model, _space, _draw in (
    (alg_RF, space_rf, False),
    (alg_lr, space_lr, False),
    (alg_DT, space_dt, True),
    (alg_MLP, space_mlp, False),
    (alg_KN, space_kn, False),
    (alg_SVC, space_svc, False),
    (alg_adboost, space_adboost, False),
    (alg_xgboost, space_xgboost, False),
    (alg_lgbm, space_lgbm, False),
):
    df_report_2 = xval_test(_model, _space, df_report_2, _draw)

In [None]:
# Display the metrics table collected for the models tuned by xval_test.
df_report_2