# Import the Necessary Libraries

In [1]:
# Data and Plotting
import pandas as pd
import numpy as np
import plotly.express as px
from ucimlrepo import fetch_ucirepo 
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay, DetCurveDisplay

# Data encoding and Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, Normalizer
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# Dimensionality Reduction and Clustering Algorithms
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans

from sklearn.utils.class_weight import compute_sample_weight

# Data spliting and Cross Validation and Performance Metrics
from sklearn.model_selection import train_test_split as tts, StratifiedKFold, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.metrics import recall_score, f1_score, balanced_accuracy_score, roc_auc_score
from sklearn.metrics import  make_scorer,  precision_score, accuracy_score

# models
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier as FNN

from collections import defaultdict as dd
from sklearn.utils import resample

%matplotlib notebook

# Load the Dataset

In [2]:
data = pd.read_csv("Thyroid_Diff.csv")

X = data.drop("Recurred", axis=1)
y = data["Recurred"]
y = y.map({"No":0,"Yes":1})

X_train, X_test, y_train, y_test = tts(X, y, 
                                       train_size=.75, 
                                       random_state=321
                                      )

X_train

Recurred
0    210
1     77
Name: count, dtype: int64

In [None]:
y_train.value_counts()

# Define the Data Preprocessing Encoders

In [3]:
# Define preprocessor
num_features = list(X.columns[:1])
cat_features = list(X.columns[1:])

# Define the numerical transformer
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), 
    ("scaler", MinMaxScaler())
])

# Define the categorical transformer
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), 
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Define your ColumnTransformer (preprocessor)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ],
    remainder="passthrough",
)

# Compute sample weights based on class imbalance
sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)

# Define the Models and Metrics

In [4]:
# Define the models using the best parameters from hyperparameter tuning
models_pca = {
    "RF": RFC(criterion='entropy', max_depth=None, max_features='log2', min_samples_leaf=4, min_samples_split=6, 
              n_estimators=403),
    "GB": GBC(criterion='friedman_mse', learning_rate=0.35, loss='exponential', max_depth=6, n_estimators=150, tol=1e-12),
    "SVM": SVC(probability=True, C=0.1, kernel='sigmoid', tol=0.001),
    "LR": LR(C=0.36, penalty='l1', solver='liblinear'),
    "KNN": KNN(n_neighbors=17, p=2, weights='distance'),
    "FNN": FNN(activation='relu', alpha=0.5, early_stopping=True, hidden_layer_sizes=(300,), learning_rate='adaptive', 
               max_iter=10000, solver='lbfgs', tol=1e-05),
}

models_tsvd = {
    "RF": RFC(criterion='entropy', max_depth=None, max_features='log2', 
              min_samples_leaf=3, min_samples_split=6, n_estimators=400),
    "GB": GBC(criterion='squared_error', learning_rate=0.3, loss= 'log_loss', 
              max_depth=4, n_estimators=125, tol=1e-10),
    "SVM": SVC(probability=True, C=0.25, kernel='sigmoid', tol=0.01),
    "LR": LR(C=0.1, max_iter=5000, penalty='l2', solver='liblinear', tol=0.001),
    "KNN": KNN(n_neighbors=18, p=4, weights='distance'),
    "FNN": FNN(activation='identity', alpha=1.0, early_stopping=True, hidden_layer_sizes=(125, 155), 
               learning_rate='constant', max_iter=15000, solver='adam', tol=0.001),
}

from functools import partial
metrics = {
    "Balanced Accuracy": balanced_accuracy_score,
    "F1 Score": partial(f1_score, average='weighted'),
    "ROC AUC": roc_auc_score,
    "Sensitivity": partial(recall_score, pos_label=1),
    "Specificity": partial(recall_score, pos_label=0),
    "Precision": partial(precision_score, average='weighted'),
}

# Define the list of metrics for models evaluation using 10-fold CV
ten_fold_CV_metrics = {
    "Balanced Accuracy": "balanced_accuracy",
    "F1 Score": "f1_macro",
    "ROC AUC": "roc_auc",
    "Sensitivity": make_scorer(recall_score, pos_label=1),
    "Specificity":  make_scorer(recall_score, pos_label=0),
    "Precision": "precision"
}

# Define the Function to Calculate the 95% CI of the Performance Metrics of the Classification Pipelines on the Test Set Using Bootstrapping

In [5]:
# Function to calculate CI for a metric using bootstrapping

def calculate_bootstrap_ci(y_true, y_pred, metric, n_iterations=1000, alpha=0.05, random_state=42):
    """
    Calculates the (1-alpha)*100% confidence interval for a given metric using bootstrapping.

    Args:
        y_true (array-like): True labels.
        y_pred (array-like): Predicted labels (for accuracy and F1) or predicted probabilities (for AUC).
        metric (callable): The metric function (e.g., accuracy_score, roc_auc_score, f1_score).
        n_iterations (int): Number of bootstrap iterations.
        alpha (float): Significance level (e.g., 0.05 for a 95% CI).
        random_state (int, optional): Random seed for reproducibility. Defaults to None.

    Returns:
        tuple: A tuple containing the lower and upper bounds of the confidence interval.
    """
    n_samples = len(y_true)
    bootstrap_scores = []
    rng = np.random.RandomState(random_state)

    for _ in range(n_iterations):
        indices = rng.choice(range(n_samples), size=n_samples, replace=True)
        y_true_resampled = np.array(y_true)[indices]
        y_pred_resampled = np.array(y_pred)[indices]
        score = metric(y_true_resampled, y_pred_resampled)
        bootstrap_scores.append(score)
        bootstrap_mean = np.mean(bootstrap_scores)

    lower_percentile = alpha / 2 * 100
    upper_percentile = (1 - alpha / 2) * 100
    lower_bound = np.percentile(bootstrap_scores, lower_percentile)
    upper_bound = np.percentile(bootstrap_scores, upper_percentile)

    return bootstrap_mean, lower_bound, upper_bound

# Define the Function to Calculate the 95% CI of the Performance Metrics of the Classification Pipelines  in Stratified 10-Fold CV

In [6]:
def compute_confidence_interval(scores, confidence=0.95):
    # Calculate the standard error
    mean_score = np.mean(scores)
    stderr = np.std(scores) / np.sqrt(len(scores))
    
    # Calculate the z-score for 95% confidence
    z_score = 1.96  # for 95% CI
    
    # Confidence interval calculation
    lower_bound = mean_score - z_score * stderr
    upper_bound = mean_score + z_score * stderr
    
    return mean_score, lower_bound, upper_bound

# PCA-Classifuication Pipeline

## a. Model evaluation on testing set

In [7]:
pca = PCA(n_components=5)

# Store results
results_pca_test_set = {}

for model_name, model in models_pca.items():
    print(f"\rEvaluating {model_name}...", end="", flush=True)
    
    # Combine the dimensionality reduction, and model into a pipeline
    pipeline = Pipeline(
        steps = [
            ("preprocessor", preprocessor),
            ("pca", pca),
            ("clf", model),
        ])
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    
    model_results = {}   
    for metric_name, metric_func in metrics.items():
        mean, lower, upper = calculate_bootstrap_ci(y_test, y_pred, metric_func)
                       
        model_results[metric_name] = {'mean_val': mean, 'CI_lower': lower, 'CI_upper': upper}

    results_pca_test_set[model_name] = model_results
    
    print(f"\rEvaluation completed for {model_name}")

print("All Evaluation completed.")

# print()
# for model_name, metricss in results_pca_test_set.items():
#     print(f"Results for {model_name}:")
#     for metric_name, ci_values in metricss.items():
#         print(f"{metric_name} 'mean': {ci_values['mean_val']:.3f}, CI: [{ci_values['CI_lower']:.3f}, {ci_values['CI_upper']:.3f}]")
#     print()


# Print Results in a dataframe

# Create a list to hold rows
rows = []

for model_name, metricss in results_pca_test_set.items():
    row = {"Model": model_name}
    for metric_name, ci_values in metricss.items():
        mean = ci_values['mean_val']
        lower = ci_values['CI_lower']
        upper = ci_values['CI_upper']
        formatted = f"{mean:.3f} ({lower:.3f}–{upper:.3f})"
        row[metric_name] = formatted
    rows.append(row)

# Create DataFrame
df_results = pd.DataFrame(rows)
df_results.set_index("Model", inplace=True)

# Display or export
df_results.to_csv("../results/Performance of PCA-Pipeline on the Test Set.csv", index=False)
df_results

Evaluation completed for RF
Evaluation completed for GB
Evaluation completed for SVM
Evaluation completed for LR
Evaluation completed for KNN
Evaluation completed for FNN
All Evaluation completed.


Unnamed: 0_level_0,Balanced Accuracy,F1 Score,ROC AUC,Sensitivity,Specificity,Precision
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
RF,0.905 (0.828–0.968),0.926 (0.870–0.979),0.905 (0.828–0.968),0.841 (0.692–0.964),0.969 (0.922–1.000),0.929 (0.874–0.979)
GB,0.865 (0.786–0.938),0.885 (0.820–0.947),0.865 (0.786–0.938),0.807 (0.657–0.931),0.923 (0.848–0.983),0.887 (0.822–0.947)
SVM,0.936 (0.875–0.985),0.937 (0.886–0.979),0.936 (0.875–0.985),0.936 (0.839–1.000),0.937 (0.877–0.985),0.940 (0.890–0.981)
LR,0.953 (0.897–0.993),0.958 (0.916–0.990),0.953 (0.897–0.993),0.936 (0.839–1.000),0.969 (0.925–1.000),0.959 (0.916–0.990)
KNN,0.913 (0.844–0.970),0.927 (0.873–0.979),0.913 (0.844–0.970),0.872 (0.735–0.973),0.954 (0.895–1.000),0.928 (0.874–0.979)
FNN,0.929 (0.868–0.977),0.928 (0.876–0.969),0.929 (0.868–0.977),0.934 (0.833–1.000),0.924 (0.859–0.984),0.932 (0.882–0.972)


## b. Model evaluation in the stratified 10-fold CV using the entire dataset

In [8]:
# Set up Stratified K-Fold Cross-Validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=420)

# Define the best Dimensionality Reduction Technique
pca = PCA(n_components=5)

results_pca_10fold_cv = {}

for model_name, model in models_tsvd.items():
    
    print(f"\rRunning {n_splits}-fold CV for {model_name}...", end="", flush=True)
    
    # Combine the dimensionality reduction, and model into a pipeline
    pipeline = Pipeline(
        steps = [
            ("preprocessor", preprocessor),
            ("pca", pca),
            ("clf", model),
        ])
    
    model_results = {}
    for metric_name, metric in ten_fold_CV_metrics.items():
        scores = cross_val_score(pipeline, X, y, cv=skf, scoring=metric)
        mean, lower_bound, upper_bound = compute_confidence_interval(scores)
        model_results[metric_name] = {'mean_val': mean, 'CI_lower': lower_bound, 'CI_upper': upper_bound}
        
    results_pca_10fold_cv[model_name] = model_results
    
    print(f"\rCompleted {n_splits}-fold CV for {model_name}")

print(f"\rAll {n_splits}-fold CV Completed")

# print()
# Print Results
# for model_name, metricss in results_pca_10fold_cv.items():
#     print(f"Results for {model_name}:")
#     for metric_name, ci_values in metricss.items():
#         print(f"{metric_name} 'mean': {ci_values['mean_val']:.3f}, CI: [{ci_values['CI_lower']:.3f}, {ci_values['CI_upper']:.3f}]")
#     print()

# Print Results in a dataframe
# Create a list to hold rows
rows = []

for model_name, metricss in results_pca_10fold_cv.items():
    row = {"Model": model_name}
    for metric_name, ci_values in metricss.items():
        mean = ci_values['mean_val']
        lower = ci_values['CI_lower']
        upper = ci_values['CI_upper']
        formatted = f"{mean:.3f} ({lower:.3f}–{upper:.3f})"
        row[metric_name] = formatted
    rows.append(row)

# Create DataFrame
df_results = pd.DataFrame(rows)
df_results.set_index("Model", inplace=True)

# Display or export
df_results.to_csv("../results/Performance of PCA-Pipeline in the 10-Fold CV.csv", index=False)
df_results

Completed 10-fold CV for RF.
Completed 10-fold CV for GB.
Completed 10-fold CV for SVM.
Completed 10-fold CV for LR.
Completed 10-fold CV for KNN.
Completed 10-fold CV for FNN.
All 10-fold CV Completed


Unnamed: 0_level_0,Balanced Accuracy,F1 Score,ROC AUC,Sensitivity,Specificity,Precision
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
RF,0.867 (0.826–0.908),0.882 (0.843–0.921),0.965 (0.948–0.981),0.798 (0.712–0.885),0.957 (0.935–0.979),0.864 (0.802–0.926)
GB,0.859 (0.824–0.894),0.861 (0.827–0.896),0.953 (0.936–0.970),0.797 (0.737–0.857),0.917 (0.881–0.953),0.808 (0.735–0.882)
SVM,0.873 (0.828–0.918),0.860 (0.821–0.900),0.961 (0.940–0.981),0.845 (0.761–0.928),0.901 (0.881–0.922),0.770 (0.725–0.816)
LR,0.859 (0.819–0.899),0.856 (0.821–0.891),0.965 (0.950–0.980),0.798 (0.716–0.880),0.920 (0.900–0.940),0.800 (0.763–0.838)
KNN,0.865 (0.827–0.903),0.869 (0.836–0.903),0.959 (0.940–0.979),0.788 (0.709–0.867),0.942 (0.927–0.957),0.843 (0.812–0.875)
FNN,0.859 (0.824–0.893),0.836 (0.789–0.884),0.945 (0.927–0.962),0.835 (0.742–0.929),0.912 (0.891–0.934),0.733 (0.685–0.781)


# t-SVD-Classification Pipelines

## a. Model evaluation on testing set

In [9]:
tsvd = TruncatedSVD(n_components=5)

# Store results
results_tsvd_test_set = {}

for model_name, model in models_tsvd.items():
    print(f"\rEvaluating {model_name}...", end="", flush=True)
    
    # Combine the dimensionality reduction, and model into a pipeline
    pipeline = Pipeline(
        steps = [
            ("preprocessor", preprocessor),
            ("pca", tsvd),
            ("clf", model),
        ])
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    model_results = {}   
    for metric_name, metric_func in metrics.items():
        mean, lower, upper = calculate_bootstrap_ci(y_test, y_pred, metric_func)
                       
        model_results[metric_name] = {'mean_val': mean, 'CI_lower': lower, 'CI_upper': upper}

    results_tsvd_test_set[model_name] = model_results
    
    print(f"\rEvaluation completed for {model_name}")

print("All Evaluation completed.")

# print()
# Print Results
# for model_name, metricss in results_tsvd_test_set.items():
#     print(f"Results for {model_name}:")
#     for metric_name, ci_values in metricss.items():
#         print(f"{metric_name} 'mean': {ci_values['mean_val']:.3f}, CI: [{ci_values['CI_lower']:.3f}, {ci_values['CI_upper']:.3f}]")
#     print()


# Print Results in a dataframe
# Create a list to hold rows
rows = []

for model_name, metricss in results_tsvd_test_set.items():
    row = {"Model": model_name}
    for metric_name, ci_values in metricss.items():
        mean = ci_values['mean_val']
        lower = ci_values['CI_lower']
        upper = ci_values['CI_upper']
        formatted = f"{mean:.3f} ({lower:.3f}–{upper:.3f})"
        row[metric_name] = formatted
    rows.append(row)

# Create DataFrame
df_results = pd.DataFrame(rows)
df_results.set_index("Model", inplace=True)

# Display or export
df_results.to_csv("../results/Performance of tSVD-Pipeline on the Test Set.csv", index=False)
df_results

Evaluation completed for RF
Evaluation completed for GB
Evaluation completed for SVM
Evaluation completed for LR
Evaluation completed for KNN
Evaluation completed for FNN
All Evaluation completed.


Unnamed: 0_level_0,Balanced Accuracy,F1 Score,ROC AUC,Sensitivity,Specificity,Precision
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
RF,0.920 (0.852–0.976),0.937 (0.883–0.979),0.920 (0.852–0.976),0.871 (0.741–0.969),0.969 (0.922–1.000),0.938 (0.887–0.980)
GB,0.882 (0.799–0.950),0.897 (0.832–0.958),0.882 (0.799–0.950),0.839 (0.694–0.962),0.924 (0.859–0.984),0.899 (0.837–0.958)
SVM,0.928 (0.859–0.983),0.947 (0.895–0.990),0.928 (0.859–0.983),0.872 (0.733–0.971),0.984 (0.952–1.000),0.950 (0.905–0.990)
LR,0.944 (0.880–0.992),0.958 (0.914–0.990),0.944 (0.880–0.992),0.904 (0.781–1.000),0.984 (0.952–1.000),0.959 (0.917–0.990)
KNN,0.913 (0.844–0.970),0.927 (0.873–0.979),0.913 (0.844–0.970),0.872 (0.735–0.973),0.954 (0.895–1.000),0.928 (0.874–0.979)
FNN,0.822 (0.741–0.902),0.876 (0.803–0.946),0.822 (0.741–0.902),0.644 (0.481–0.805),1.000 (1.000–1.000),0.903 (0.861–0.951)


## b. Model evaluation in the stratified 10-fold CV using the entire dataset

In [10]:
# Set up Stratified K-Fold Cross-Validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=420)

# Define the best Dimensionality Reduction Technique from Method 3
tsvd = TruncatedSVD(n_components=5)

results_tsvd_10fold_cv = {}

for model_name, model in models_tsvd.items():
    
    print(f"\rRunning {n_splits}-fold CV for {model_name}...", end="", flush=True)
    
    # Combine the dimensionality reduction, and model into a pipeline
    pipeline = Pipeline(
        steps = [
            ("preprocessor", preprocessor),
            ("pca", tsvd),
            ("clf", model),
        ])
    
    model_results = {}
    for metric_name, metric in ten_fold_CV_metrics.items():
        scores = cross_val_score(pipeline, X, y, cv=skf, scoring=metric)
        mean, lower_bound, upper_bound = compute_confidence_interval(scores)
        model_results[metric_name] = {'mean_val': mean, 'CI_lower': lower, 'CI_upper': upper}
        
    results_tsvd_10fold_cv[model_name] = model_results
    
    print(f"\rCompleted {n_splits}-fold CV for {model_name}")

print(f"\rAll {n_splits}-fold CV Completed")

# print()
# Print Results
# for model_name, metricss in results_tsvd_10fold_cv.items():
#     print(f"Results for {model_name}:")
#     for metric_name, ci_values in metricss.items():
#         print(f"{metric_name} 'mean': {ci_values['mean_val']:.3f}, CI: [{ci_values['CI_lower']:.3f}, {ci_values['CI_upper']:.3f}]")
#     print()


# Print Results in a dataframe
# Create a list to hold rows
rows = []

for model_name, metricss in results_tsvd_10fold_cv.items():
    row = {"Model": model_name}
    for metric_name, ci_values in metricss.items():
        mean = ci_values['mean_val']
        lower = ci_values['CI_lower']
        upper = ci_values['CI_upper']
        formatted = f"{mean:.3f} ({lower:.3f}–{upper:.3f})"
        row[metric_name] = formatted
    rows.append(row)

# Create DataFrame
df_results = pd.DataFrame(rows)
df_results.set_index("Model", inplace=True)

# Display or export
df_results.to_csv("../results/Performance of tSVD-Pipeline in the 10-Fold CV.csv", index=False)
df_results

Completed 10-fold CV for RF.
Completed 10-fold CV for GB.
Completed 10-fold CV for SVM.
Completed 10-fold CV for LR.
Completed 10-fold CV for KNN.
Completed 10-fold CV for FNN.
All 10-fold CV Completed


Unnamed: 0_level_0,Balanced Accuracy,F1 Score,ROC AUC,Sensitivity,Specificity,Precision
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
RF,0.877 (0.861–0.951),0.879 (0.861–0.951),0.967 (0.861–0.951),0.788 (0.861–0.951),0.953 (0.861–0.951),0.877 (0.861–0.951)
GB,0.882 (0.861–0.951),0.880 (0.861–0.951),0.958 (0.861–0.951),0.835 (0.861–0.951),0.935 (0.861–0.951),0.852 (0.861–0.951)
SVM,0.848 (0.861–0.951),0.843 (0.861–0.951),0.961 (0.861–0.951),0.780 (0.861–0.951),0.916 (0.861–0.951),0.783 (0.861–0.951)
LR,0.854 (0.861–0.951),0.859 (0.861–0.951),0.965 (0.861–0.951),0.770 (0.861–0.951),0.938 (0.861–0.951),0.832 (0.861–0.951)
KNN,0.860 (0.861–0.951),0.868 (0.861–0.951),0.952 (0.861–0.951),0.770 (0.861–0.951),0.949 (0.861–0.951),0.862 (0.861–0.951)
FNN,0.805 (0.861–0.951),0.854 (0.861–0.951),0.962 (0.861–0.951),0.677 (0.861–0.951),0.964 (0.861–0.951),0.913 (0.861–0.951)
