# Import the Necessary Libraries

In [1]:
# Data and Plotting
import pandas as pd
import numpy as np
import plotly.express as px
from ucimlrepo import fetch_ucirepo 
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay, DetCurveDisplay, confusion_matrix

# Data encoding and Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, Normalizer
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# Dimensionality Reduction and Clustering Algorithms
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans

from sklearn.utils.class_weight import compute_sample_weight

# Data spliting and Cross Validation and Performance Metrics
from sklearn.model_selection import train_test_split as tts, StratifiedKFold, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.metrics import recall_score, f1_score, balanced_accuracy_score, roc_auc_score
from sklearn.metrics import  make_scorer,  precision_score, accuracy_score

# models
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier as FNN

from collections import defaultdict as dd
from sklearn.utils import resample

from scipy.stats import wilcoxon
from statsmodels.stats.contingency_tables import mcnemar

%matplotlib notebook

# Perform Wilcoxon Signed-Rank Test

In [2]:
data = pd.read_csv("Thyroid_Diff.csv")

X = data.drop("Recurred", axis=1)
y = data["Recurred"]
y = y.map({"No":0,"Yes":1})

# Define preprocessor
num_features = list(X.columns[:1])
cat_features = list(X.columns[1:])

# Define the numerical transformer
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), 
    ("scaler", MinMaxScaler())
])

# Define the categorical transformer
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), 
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Define your ColumnTransformer (preprocessor)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ],
    remainder="passthrough",
)

# Compute sample weights based on class imbalance
sample_weights = compute_sample_weight(class_weight="balanced", y=y)


# Define the models using the best parameters from hyperparameter tuning
models_pca = {
    "RF": RFC(criterion='entropy', max_depth=None, max_features='log2', min_samples_leaf=4, min_samples_split=6, 
              n_estimators=403),
    "GB": GBC(criterion='friedman_mse', learning_rate=0.35, loss='exponential', max_depth=6, n_estimators=150, tol=1e-12),
    "SVM": SVC(probability=True, C=0.1, kernel='sigmoid', tol=0.001),
    "LR": LR(C=0.36, penalty='l1', solver='liblinear'),
    "KNN": KNN(n_neighbors=17, p=2, weights='distance'),
    "FNN": FNN(activation='relu', alpha=0.5, early_stopping=True, hidden_layer_sizes=(300,), learning_rate='adaptive', 
               max_iter=10000, solver='lbfgs', tol=1e-05),
}

models_tsvd = {
    "RF": RFC(criterion='entropy', max_depth=None, max_features='log2', 
              min_samples_leaf=3, min_samples_split=6, n_estimators=400),
    "GB": GBC(criterion='squared_error', learning_rate=0.3, loss= 'log_loss', 
              max_depth=4, n_estimators=125, tol=1e-10),
    "SVM": SVC(probability=True, C=0.25, kernel='sigmoid', tol=0.01),
    "LR": LR(C=0.1, max_iter=5000, penalty='l2', solver='liblinear', tol=0.001),
    "KNN": KNN(n_neighbors=18, p=4, weights='distance'),
    "FNN": FNN(activation='identity', alpha=1.0, early_stopping=True, hidden_layer_sizes=(125, 155), 
               learning_rate='constant', max_iter=15000, solver='adam', tol=0.001),
}

pca = PCA(n_components=5)
tsvd = TruncatedSVD(n_components=5)

# Set up Stratified K-Fold Cross-Validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=420)

# Dictionary to store Wilcoxon test results
wilcoxon_results = {}

for (model_name_pca, model_pca), (model_name_tsvd, model_tsvd) in zip(models_pca.items(), models_tsvd.items()):
    
    print(f"\nComparing PCA vs t-SVD for {model_name_pca}")
    
    # Build pipelines
    pipeline_pca = Pipeline([
        ("preprocessor", preprocessor),
        ("pca", pca),
        ("clf", model_pca),
    ])

    pipeline_tsvd = Pipeline([
        ("preprocessor", preprocessor),
        ("pca", tsvd),
        ("clf", model_tsvd),
    ])

    # Cross-validation scores (e.g., balanced accuracy)
    scores_pca = cross_val_score(pipeline_pca, X, y, cv=skf, scoring="balanced_accuracy")
    scores_tsvd = cross_val_score(pipeline_tsvd, X, y, cv=skf, scoring="balanced_accuracy")

    # Wilcoxon signed-rank test (non-parametric paired comparison)
    stat, p = wilcoxon(scores_pca, scores_tsvd)

    # Store the results
    wilcoxon_results[model_name_pca] = {
        "PCA_mean": scores_pca.mean(),
        "tSVD_mean": scores_tsvd.mean(),
        "statistic": stat,
        "p_value": p
    }

# Create a list to hold rows of the DataFrame
results_table = []

for model, res in wilcoxon_results.items():
    row = {
        "PCA pipeline": f"PCA-{model}",
        "t-SVD pipeline": f"tSVD-{model}",
        "p-value": round(res["p_value"], 4)
    }
    results_table.append(row)

# Convert to DataFrame
wilcoxon_df = pd.DataFrame(results_table)
wilcoxon_df.to_csv("../results/Wilcoxon Test Results.csv", index=False)

# Display the DataFrame
print(wilcoxon_df.to_string(index=False))


Comparing PCA vs t-SVD for RF





Comparing PCA vs t-SVD for GB





Comparing PCA vs t-SVD for SVM





Comparing PCA vs t-SVD for LR





Comparing PCA vs t-SVD for KNN





Comparing PCA vs t-SVD for FNN
PCA pipeline t-SVD pipeline  p-value
      PCA-RF        tSVD-RF   0.8316
      PCA-GB        tSVD-GB   0.0929
     PCA-SVM       tSVD-SVM   0.1730
      PCA-LR        tSVD-LR   0.4982
     PCA-KNN       tSVD-KNN   0.0422
     PCA-FNN       tSVD-FNN   0.0137


# Perform McNemar’s test

In [3]:
# Split into train-test sets
X_train, X_test, y_train, y_test = tts(
    X, y, test_size=0.25, stratify=y, random_state=42)

# Fit and compare pipelines for each model
results_mcnemar = {}

for (model_name_pca, model_pca), (model_name_tsvd, model_tsvd) in zip(models_pca.items(), models_tsvd.items()):
    
    print(f"\nComparing PCA vs t-SVD for {model_name_pca}")
    
    # Build PCA pipeline
    pipeline_pca = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("pca", pca),
            ("clf", model_pca),
        ]
    )
    
    # Build t-SVD pipeline
    pipeline_tsvd = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("pca", tsvd),
            ("clf", model_tsvd),
        ]
    )
    
    # Fit both on training data
    pipeline_pca.fit(X_train, y_train)
    pipeline_tsvd.fit(X_train, y_train)
    
    # Predict on test data
    y_pred_pca = pipeline_pca.predict(X_test)
    y_pred_tsvd = pipeline_tsvd.predict(X_test)
    
    # Build contingency table for McNemar's test
    table = confusion_matrix(y_pred_pca, y_pred_tsvd, labels=[0, 1])
    
    # Format table as required by McNemar's test:
    #            tSVD
    #           0    1
    #   PCA 0  [a,   b]
    #        1 [c,   d]
    b = np.sum((y_pred_pca == 1) & (y_pred_tsvd == 0))
    c = np.sum((y_pred_pca == 0) & (y_pred_tsvd == 1))
    
    mcnemar_result = mcnemar([[0, b], [c, 0]], exact=False, correction=True)

    results_mcnemar[model_name_pca] = {
        "b": b,
        "c": c,
        "statistic": mcnemar_result.statistic,
        "p_value": mcnemar_result.pvalue,
    }

# Print results
for model, result in results_mcnemar.items():
    print(f"\n{model} - McNemar’s test results:")
    print(f"b (PCA correct, t-SVD wrong): {result['b']}")
    print(f"c (t-SVD correct, PCA wrong): {result['c']}")
    print(f"Statistic: {result['statistic']:.4f}, p-value: {result['p_value']:.4f}")


Comparing PCA vs t-SVD for RF

Comparing PCA vs t-SVD for GB

Comparing PCA vs t-SVD for SVM

Comparing PCA vs t-SVD for LR

Comparing PCA vs t-SVD for KNN

Comparing PCA vs t-SVD for FNN

RF - McNemar’s test results:
b (PCA correct, t-SVD wrong): 2
c (t-SVD correct, PCA wrong): 0
Statistic: 0.5000, p-value: 0.4795

GB - McNemar’s test results:
b (PCA correct, t-SVD wrong): 5
c (t-SVD correct, PCA wrong): 5
Statistic: 0.1000, p-value: 0.7518

SVM - McNemar’s test results:
b (PCA correct, t-SVD wrong): 4
c (t-SVD correct, PCA wrong): 0
Statistic: 2.2500, p-value: 0.1336

LR - McNemar’s test results:
b (PCA correct, t-SVD wrong): 1
c (t-SVD correct, PCA wrong): 0
Statistic: 0.0000, p-value: 1.0000

KNN - McNemar’s test results:
b (PCA correct, t-SVD wrong): 2
c (t-SVD correct, PCA wrong): 1
Statistic: 0.0000, p-value: 1.0000

FNN - McNemar’s test results:
b (PCA correct, t-SVD wrong): 9
c (t-SVD correct, PCA wrong): 0
Statistic: 7.1111, p-value: 0.0077
