In [None]:
# Calculate evaluation metrics (nothing is printed here; callers store the result)
def evaluate(y_true, y_pred, zero_division=1):
    """
    Return accuracy and weighted-average precision, recall and F1.

    Parameters:
        y_true: ground-truth labels.
        y_pred: predicted labels, same length as y_true.
        zero_division: forwarded to precision/recall/F1 for classes that have
            no predicted (or no true) samples. Defaults to 1, the value used
            for the per-classifier metrics in this notebook, so that all rows
            of the results table are computed the same way.

    Returns:
        Tuple (accuracy, precision, recall, f1).
    """
    weighted = {"average": "weighted", "zero_division": zero_division}
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, **weighted),
        recall_score(y_true, y_pred, **weighted),
        f1_score(y_true, y_pred, **weighted),
    )

def compute_si(X, y):
    """
    Compute the Separation Index (SI) for the given features and labels.
    SI = (1 / m) * sum(delta(li, li*)), where li* is the label of the nearest neighbor.

    Parameters:
        X: 2-D feature matrix with one row per sample.
        y: labels, one per row of X (array, list or pandas Series).

    Returns:
        Fraction of samples whose nearest neighbour (self excluded) carries
        the same label.

    Raises:
        ValueError: if X and y do not contain the same number of samples.
    """
    # Coerce to ndarray so the fancy indexing below is always positional.
    # A pandas Series would be indexed by *label*, which is wrong once the
    # index has been shuffled (e.g. after train_test_split).
    labels = np.asarray(y)
    if labels.shape[0] != X.shape[0]:
        raise ValueError(
            f"X has {X.shape[0]} samples but y has {labels.shape[0]} labels"
        )

    distances = pairwise_distances(X)  # Dense m x m distance matrix
    np.fill_diagonal(distances, np.inf)  # A sample must not be its own neighbour
    nearest_neighbor_indices = np.argmin(distances, axis=1)
    # Kronecker delta between each label and its nearest neighbour's label
    delta = (labels[nearest_neighbor_indices] == labels).astype(int)
    return np.mean(delta)

# Forward selection based on SI
def forward_selection_si(X, y, max_features, score_func=None):
    """
    Perform greedy forward feature selection using the Separation Index (SI).

    In every round each remaining column is tentatively added to the already
    selected ones, the candidate subset is scored, and the column giving the
    highest score is kept. Ties are resolved in favour of the column that
    appears first in X.columns.

    Parameters:
        X: pandas DataFrame of candidate features.
        y: labels, passed unchanged to the scoring function.
        max_features: maximum number of columns to select.
        score_func: optional callable (X_subset_ndarray, y) -> score, higher
            is better. Defaults to compute_si.

    Returns:
        List of selected column labels in the order they were chosen.
    """
    if score_func is None:
        score_func = compute_si

    selected_features = []
    remaining_features = list(X.columns)

    while len(selected_features) < max_features and remaining_features:
        best_candidate = None
        best_si = -np.inf

        for feature in remaining_features:
            current_features = selected_features + [feature]
            si = score_func(X[current_features].values, y)

            if si > best_si:
                best_si = si
                best_candidate = feature

        # Compare against None explicitly: a falsy column label such as 0 or ""
        # is a perfectly valid winner and must not stop the search.
        if best_candidate is None:
            break  # No candidate produced a comparable score (e.g. all NaN)

        selected_features.append(best_candidate)
        remaining_features.remove(best_candidate)

    return selected_features

In [None]:
!pip install ucimlrepo

from ucimlrepo import fetch_ucirepo
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import SparsePCA
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import pairwise_distances


##


# Fetch UCI repository dataset id 732 (requires the `ucimlrepo` package imported above)
darwin = fetch_ucirepo(id=732)

# Data
# Drop the first feature (participant ID). `.copy()` gives an independent frame
# so the column assignments below do not write into a slice of the original.
X = darwin.data.features.iloc[:, 1:].copy()
y = darwin.data.targets.iloc[:, 0]  # Assuming y is single-column

# Convert categorical columns to numerical (no-op when there are none)
non_numeric_cols = X.select_dtypes(include=['object']).columns
for col in non_numeric_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Encode target. Always end up with a NumPy array: later code indexes the
# labels by position (e.g. y_train[train_index], y_test[idx]), which would be
# label-based and therefore wrong on a pandas Series with a shuffled index.
y = LabelEncoder().fit_transform(y) if y.dtype == 'object' else y.to_numpy()

# Split data into train and test sets (70 % / 30 %, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=6)

# Data augmentation (if necessary): bootstrap the training set up to
# `target_size` rows. Disabled by default; change the guard to enable it.
if False:
    target_size = 500
    current_size = X_train.shape[0]

    if current_size < target_size:
        samples_needed = target_size - current_size
        # Draw row positions once and use them for BOTH features and labels.
        # Sampling the two independently would pair feature rows with
        # unrelated labels and inject label noise into the training set.
        resample_positions = np.random.RandomState(42).randint(0, current_size, size=samples_needed)
        additional_X = X_train.iloc[resample_positions]
        additional_y = np.asarray(y_train)[resample_positions]

        # Concatenate the original and augmented data
        X_train = pd.concat([X_train, additional_X], ignore_index=True)
        y_train = np.concatenate([y_train, additional_y])

# Preprocessing: drop zero-variance features, then drop one feature out of
# every highly correlated pair. All statistics come from the training split.
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
variances = X_train[numeric_cols].var()
# NOTE: despite its name this mask is True for the columns to KEEP (variance > 0)
low_variance_mask = variances > 0
selected_numeric_cols = numeric_cols[low_variance_mask]

# Ensure that non-numeric columns exist in X_train for union operation
valid_non_numeric_cols = [col for col in non_numeric_cols if col in X_train.columns]

# Filter train and test sets with one shared column set so they cannot diverge
kept_columns = selected_numeric_cols.union(valid_non_numeric_cols)
X_train_filtered = X_train[kept_columns]
X_test_filtered = X_test[kept_columns]


# Remove highly correlated features. The absolute value is used so that
# strongly *negatively* correlated features (r < -0.98) are treated as
# redundant too; only the upper triangle is inspected so that the first
# feature of each correlated pair is retained.
correlation_matrix = X_train_filtered.corr()
correlation_threshold = 0.98
upper_triangle = correlation_matrix.abs().where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
correlated_features = [column for column in upper_triangle.columns if (upper_triangle[column] > correlation_threshold).any()]
uncorrelated_mask = ~X_train_filtered.columns.isin(correlated_features)
X_train_filtered = X_train_filtered.loc[:, uncorrelated_mask]
X_test_filtered = X_test_filtered.loc[:, uncorrelated_mask]


# Apply forward selection using SI
max_features = 20  # Define the maximum number of features to select
selected_features = forward_selection_si(X_train_filtered, y_train, max_features=max_features)

# Filter train and test sets based on selected features
X_train_SI = X_train_filtered[selected_features]
X_test_SI = X_test_filtered[selected_features]

# Feature Selection: Use SelectKBest to select top features based on ANOVA F-value.
# The same budget as the SI selection is used so both methods are comparable;
# it is capped at the number of available columns because SelectKBest rejects
# a k larger than the feature count.
from sklearn.feature_selection import SelectKBest, f_classif
anova_k = min(max_features, X_train_filtered.shape[1])
select_kbest = SelectKBest(score_func=f_classif, k=anova_k)  # Select top `anova_k` features
anova_columns = [f'feature_{i}' for i in range(anova_k)]
X_train_ANOVA = pd.DataFrame(select_kbest.fit_transform(X_train_filtered, y_train), columns=anova_columns)
X_test_ANOVA = pd.DataFrame(select_kbest.transform(X_test_filtered), columns=anova_columns)

# Features selected by Forward Selection SI
features_si = selected_features  # Already available from forward_selection_si

# Features selected by ANOVA (map the selector's column positions back to names)
anova_selected_indices = select_kbest.get_support(indices=True)
features_anova = X_train_filtered.columns[anova_selected_indices].tolist()
print(f"anova_selected_indices {anova_selected_indices}")
print(f"features_anova {features_anova}")

# Compare the two methods
features_in_both = set(features_si).intersection(features_anova)  # Features common to both methods
features_only_in_si = set(features_si).difference(features_anova)  # Features unique to Forward Selection SI
features_only_in_anova = set(features_anova).difference(features_si)  # Features unique to ANOVA

# Display results
print("Features selected by Forward Selection SI:", features_si)
print("Features selected by ANOVA:", features_anova)
print("Common features:", features_in_both)
print("Features only in Forward Selection SI:", features_only_in_si)
print("Features only in ANOVA:", features_only_in_anova)

# Standardise both feature sets. Every transformer in this section is fitted
# on the training split only and then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train_SI)
X_train_SI = scaler.transform(X_train_SI)
X_test_SI = scaler.transform(X_test_SI)

# `scaler` is rebound here; from now on it refers to the ANOVA-feature scaler
scaler = StandardScaler().fit(X_train_ANOVA)
X_train_scaled = scaler.transform(X_train_ANOVA)
X_test_scaled = scaler.transform(X_test_ANOVA)

# PCA on the scaled ANOVA features, keeping enough components for 95% of the variance
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Concatenation fusion: scaled ANOVA features side by side with their PCA projection
X_train_concat = np.hstack((X_train_scaled, X_train_pca))
X_test_concat = np.hstack((X_test_scaled, X_test_pca))

# Sparse representation fusion: SparsePCA with 20 components on the scaled ANOVA features
sparse_pca = SparsePCA(n_components=20, random_state=42)
X_train_sparse = sparse_pca.fit_transform(X_train_scaled)
X_test_sparse = sparse_pca.transform(X_test_scaled)

# Weighted fusion: same layout as the concatenation, but each half is scaled
# by a fixed weight before stacking
weight_pca = 0.7
weight_original = 0.3
X_train_weighted = np.concatenate(
    (weight_original * X_train_scaled, weight_pca * X_train_pca), axis=1
)
X_test_weighted = np.concatenate(
    (weight_original * X_test_scaled, weight_pca * X_test_pca), axis=1
)

# Registry of (train, test) matrices, keyed by the feature-set name used in
# all later result labels
fused_datasets = {
    "SI": (np.array(X_train_SI), np.array(X_test_SI)),
    "PCA": (X_train_pca, X_test_pca),
    "Concatenation": (X_train_concat, X_test_concat),
    "Sparse": (X_train_sparse, X_test_sparse),
    "Weighted": (X_train_weighted, X_test_weighted),
}
# One independent, unfitted estimator per (feature set, algorithm) pair,
# stored under the key "<feature set> <algorithm>".
classifiers = {}

for feature_name in fused_datasets:
    classifiers[f"{feature_name} Random Forest"] = RandomForestClassifier(random_state=42)
    classifiers[f"{feature_name} XGBoost"] = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    classifiers[f"{feature_name} Decision Tree"] = DecisionTreeClassifier(random_state=42)
    classifiers[f"{feature_name} SVM"] = SVC(probability=True, random_state=42)
    classifiers[f"{feature_name} KNN"] = KNeighborsClassifier()
    classifiers[f"{feature_name} Logistic Regression"] = LogisticRegression(max_iter=1000)
    classifiers[f"{feature_name} Naive Bayes"] = GaussianNB()
    classifiers[f"{feature_name} Gradient Boost"] = GradientBoostingClassifier(random_state=42)

# Vote weight per algorithm for the ensemble rules (majority voting,
# weighted voting, probability averaging). The key order of this dict also
# fixes the order in which the algorithms are iterated later on.
weights = {
    "Random Forest": 2,
    "Logistic Regression": 1.5,
    "XGBoost": 2,
    "Decision Tree": 1,
    "SVM": 1,
    "KNN": 1,
    "Naive Bayes": 1,
    "Gradient Boost": 1,
}

# Accumulators filled by the evaluation loop
results, predictions, probabilities = {}, {}, {}

# Fit every base classifier on every feature set, score it on the test split,
# then score three ensemble rules built from those classifiers' outputs.
metric_keys = ("accuracy", "precision", "recall", "f1")
total_weight = sum(weights.values())

for fusion_name, (X_train_fused, X_test_fused) in fused_datasets.items():
    print(f"Number of features for {fusion_name} fusion: {X_train_fused.shape[1]}")
    print(f"SI is {compute_si(X_train_fused, y_train):.3f} for {fusion_name} features")

    for clf_name in weights:
        clf = classifiers[f"{fusion_name} {clf_name}"]
        clf.fit(X_train_fused, y_train)
        y_pred = clf.predict(X_test_fused)
        # Keyed by algorithm only: entries are overwritten for each feature set
        predictions[clf_name] = y_pred
        probabilities[clf_name] = clf.predict_proba(X_test_fused)[:, 1]  # column 1 of the probability output

        # Test-split metrics for this (feature set, classifier) pair
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
        results[f"{fusion_name} + {clf_name}"] = dict(
            zip(metric_keys, (accuracy, precision, recall, f1))
        )

        # Report every test sample this classifier got wrong
        misclassified_indices = np.flatnonzero(y_test != y_pred)
        if misclassified_indices.size:
            print(f"\nMisclassified samples for {fusion_name} + {clf_name}:")
            for idx in misclassified_indices:
                print(f"  Index: {idx}, True Label: {y_test[idx]}, Predicted Label: {y_pred[idx]}")

    # ---- Ensemble rules over the predictions collected above ----
    pred_df = pd.DataFrame(predictions)

    # Weighted sum of the hard 0/1 predictions
    weighted_votes = np.zeros(len(X_test_fused))
    for name, weight in weights.items():
        weighted_votes += pred_df[name] * weight

    # Most common prediction for each instance
    majority_vote_pred = pred_df.mode(axis=1)[0]
    # Positive when at least half of the total vote weight says 1
    weighted_vote_pred = (weighted_votes >= (total_weight / 2)).astype(int)
    # Weight-averaged class-1 probability, thresholded at 0.5
    avg_prob = sum(weight * probabilities[name] for name, weight in weights.items()) / total_weight
    prob_avg_pred = (avg_prob >= 0.5).astype(int)

    ensemble_predictions = (
        ("Majority Voting", majority_vote_pred),
        ("Weighted Voting", weighted_vote_pred),
        ("Probability Voting", prob_avg_pred),
    )
    for ensemble_label, ensemble_pred in ensemble_predictions:
        accuracy, precision, recall, f1 = evaluate(y_test, ensemble_pred)
        results[f"{fusion_name} + {ensemble_label}"] = dict(
            zip(metric_keys, (accuracy, precision, recall, f1))
        )
# Meta-classifier fusion (MCF): for every fused feature set an MLP is trained
# to pick ONE base classifier per sample; the final prediction for a sample is
# the prediction of the classifier the MLP picked.
# y_predict_MCF collects those test predictions, one column per feature set;
# j is the column written for the current feature set.
y_predict_MCF= np.zeros((y_test.shape[0],len(fused_datasets)))
j = 0
for fusion_name, (X_train_fused, X_test_fused) in fused_datasets.items():

    # Selector MLP: input = fused features, output = softmax over
    # len(weights) units, i.e. one unit per base classifier. Targets are
    # integer classifier indices (hence sparse categorical cross-entropy).
    mlp_model = Sequential()
    mlp_model.add(Dense(4, input_dim=X_train_fused.shape[1], activation='relu'))
    mlp_model.add(Dense(7, activation='relu'))
    mlp_model.add(Dense(len(weights), activation='softmax'))

    mlp_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Number of passes over the folds ("steps") and number of folds per pass
    num_steps = 11
    num_folds = 12

    # # Initialize figure with a 5x5 grid for subplots
    # fig, axs = plt.subplots(num_steps, num_folds, figsize=(20, 20))
    # fig.suptitle('Model Accuracy and Loss Across Steps and Folds', fontsize=16)

    # Set up K-Fold cross-validation and list to store history data
    # NOTE(review): with a fixed random_state the split is presumably the same
    # in every step, so each step refits identical base classifiers — confirm.
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=6)
    # Initialize lists to store mean metrics across folds for each step
    mean_train_accuracy_per_step = []
    mean_val_accuracy_per_step = []
    mean_train_loss_per_step = []
    mean_val_loss_per_step = []
    # NOTE(review): the same mlp_model instance is passed to fit() in every
    # fold of every step and is never re-created, so training presumably
    # accumulates across folds and steps — confirm this is intended.
    for step in range(num_steps):
        train_accuracy_per_fold = []
        val_accuracy_per_fold = []
        train_loss_per_fold = []
        val_loss_per_fold = []
        for fold, (train_index, test_index) in enumerate(kf.split(X_train_fused)):
            print(f"\nProcessing Fold {fold + 1}")

            # Split data into train and test sets for the current fold
            # (positional indexing: assumes y_train is a NumPy array — TODO confirm)
            _X_train, X_val = X_train_fused[train_index], X_train_fused[test_index]
            y_train_mlp, y_val = y_train[train_index], y_train[test_index]

            # Fresh base classifiers for this fold. The insertion order must
            # match `weights`, because the MLP's class index is later used to
            # pick a column that is filled in `weights` order.
            _classifiers ={
                "Random Forest": RandomForestClassifier(random_state=42),
                "Logistic Regression": LogisticRegression(max_iter=1000),
                "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
                "Decision Tree": DecisionTreeClassifier(random_state=42),
                "SVM": SVC(probability=True, random_state=42),
                "KNN": KNeighborsClassifier(),
                "Naive Bayes": GaussianNB(),
                "Gradient Boost": GradientBoostingClassifier(random_state=42),
            }

            # Fit each base classifier on the training part of the fold
            for clf_name, clf in _classifiers.items():
                clf.fit(_X_train, y_train_mlp)
                # y_pred = clf.predict_proba(X_test_fused)
            # One-hot scratch matrices (rows = samples, columns = classifiers);
            # both are collapsed to integer classifier indices further down.
            y_for_mlp_val = np.zeros((X_val.shape[0],len(_classifiers)))
            # y_for_mlp_train = np.zeros((_X_train.shape[0],len(_classifiers)))

            y_for_mlp_test = np.zeros((X_test_fused.shape[0],len(_classifiers)))
            # MLP target for each held-out fold sample: the classifier with the
            # highest score, where score = predicted probability of the
            # sample's TRUE class (column 1 if the label is 1, else column 0)
            # multiplied by that classifier's stored F1 raised to the 4th power.
            # Inside the comprehension `_` is the classifier name.
            # NOTE(review): results[...]["f1"] was computed against y_test in
            # the earlier evaluation loop, so test-set information enters the
            # MLP's training targets.
            # NOTE(review): predict_proba is called once per sample per
            # classifier; batching per classifier would presumably be much
            # faster — verify equivalence before changing.
            for idx,_x in enumerate(X_val):
                if y_val[idx] == 1:
                    _y = np.array([clf.predict_proba(_x.reshape(1,-1))*(results[f"{fusion_name} + {_}"]["f1"])**4 for _,clf in _classifiers.items()])[:,0,1]
                else:
                    _y = np.array([clf.predict_proba(_x.reshape(1,-1))*(results[f"{fusion_name} + {_}"]["f1"])**4 for _,clf in _classifiers.items()])[:,0,0]
                y_for_mlp_val[idx,np.argmax(_y)] = 1
                print(_y)
            y_for_mlp_val = np.argmax(y_for_mlp_val,axis=1)


            # Same target construction for the test split; it reads y_test
            # directly and is used only as validation data in fit() below.
            for idx,_x in enumerate(X_test_fused):
                if y_test[idx] == 1:
                    _y = np.array([clf.predict_proba(_x.reshape(1,-1))*(results[f"{fusion_name} + {_}"]["f1"])**4 for _,clf in _classifiers.items()])[:,0,1]
                else:
                    _y = np.array([clf.predict_proba(_x.reshape(1,-1))*(results[f"{fusion_name} + {_}"]["f1"])**4 for _,clf in _classifiers.items()])[:,0,0]
                y_for_mlp_test[idx,np.argmax(_y)] = 1
            y_for_mlp_test = np.argmax(y_for_mlp_test,axis=1)
            print(y_for_mlp_val)

            # The MLP is trained on the held-out part of the fold (X_val),
            # i.e. on samples the fold's base classifiers were not fitted on.
            history = mlp_model.fit(X_val , y_for_mlp_val, epochs=15, batch_size=16, verbose=1, validation_data=(X_test_fused, y_for_mlp_test))

            # Capture the final train and validation accuracy and loss for each fold in this step
            train_accuracy_per_fold.append(history.history['accuracy'][-1])
            val_accuracy_per_fold.append(history.history['val_accuracy'][-1])
            train_loss_per_fold.append(history.history['loss'][-1])
            val_loss_per_fold.append(history.history['val_loss'][-1])

        # Calculate the mean metrics across folds for this step
        mean_train_accuracy_per_step.append(np.mean(train_accuracy_per_fold))
        mean_val_accuracy_per_step.append(np.mean(val_accuracy_per_fold))
        mean_train_loss_per_step.append(np.mean(train_loss_per_fold))
        mean_val_loss_per_step.append(np.mean(val_loss_per_fold))

    # Plot mean accuracy over steps
    plt.figure(figsize=(10, 5))

    # Plot training and validation accuracies
    plt.plot(range(1, num_steps + 1), mean_train_accuracy_per_step, marker='o', linestyle='-', label='Mean Train Accuracy')
    plt.plot(range(1, num_steps + 1), mean_val_accuracy_per_step, marker='o', linestyle='--', label='Mean Validation Accuracy')
    plt.title('Mean Training and Validation Accuracy Across Steps')
    plt.xlabel('Step')
    plt.ylabel('Mean Accuracy')
    plt.legend()
    plt.show()

    # Plot mean loss over steps
    plt.figure(figsize=(10, 5))

    # Plot training and validation losses
    plt.plot(range(1, num_steps + 1), mean_train_loss_per_step, marker='o', linestyle='-', label='Mean Train Loss')
    plt.plot(range(1, num_steps + 1), mean_val_loss_per_step, marker='o', linestyle='--', label='Mean Validation Loss')
    plt.title('Mean Training and Validation Loss Across Steps')
    plt.xlabel('Step')
    plt.ylabel('Mean Loss')
    plt.legend()
    plt.show()

    # Final MCF prediction: the MLP picks a classifier index per test sample...
    selected_classifier = np.argmax(mlp_model.predict(X_test_fused), axis=1)
    # NOTE(review): `_classifiers` here is the dict left over from the last
    # fold; only its length is used.
    _y_pred = np.zeros((y_test.shape[0],len(_classifiers)))

    # ...and the prediction is taken from the corresponding classifier that was
    # fitted on the full training split (columns follow the order of `weights`).
    for i, clf_name in enumerate(weights.keys()):
        clf = classifiers[f"{fusion_name} {clf_name}"]
        _y_pred[:,i] = clf.predict(X_test_fused)
    # y_pred = _y_pred[:,selected_classifier]
    # Row-wise pick: element [r, selected_classifier[r]] for every row r
    y_pred = _y_pred[np.arange(_y_pred.shape[0]), selected_classifier]
    y_predict_MCF[:,j] = y_pred
    j += 1
    # Calculate various evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)

    # Store metrics in the results dictionary
    results[f"{fusion_name} + MCF"] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }
    print(results[f"{fusion_name} + MCF"])
    # Identify and print misclassified samples
    misclassified_indices = np.where(y_test != y_pred)[0]
    if len(misclassified_indices) > 0:
        print(f"\nMisclassified samples for {fusion_name} + MCF:")
        for idx in misclassified_indices:
            print(f"  Index: {idx}, True Label: {y_test[idx]}, Predicted Label: {y_pred[idx]}")
In [None]:
!pip install ucimlrepo

from ucimlrepo import fetch_ucirepo
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import SparsePCA
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import pairwise_distances


##


# Fetch UCI repository dataset id 732 (requires the `ucimlrepo` package imported above)
darwin = fetch_ucirepo(id=732)

# Data
# Drop the first feature (participant ID). `.copy()` gives an independent frame
# so the column assignments below do not write into a slice of the original.
X = darwin.data.features.iloc[:, 1:].copy()
y = darwin.data.targets.iloc[:, 0]  # Assuming y is single-column

# Convert categorical columns to numerical (no-op when there are none)
non_numeric_cols = X.select_dtypes(include=['object']).columns
for col in non_numeric_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Encode target. Always end up with a NumPy array: later code indexes the
# labels by position (e.g. y_train[train_index], y_test[idx]), which would be
# label-based and therefore wrong on a pandas Series with a shuffled index.
y = LabelEncoder().fit_transform(y) if y.dtype == 'object' else y.to_numpy()

# Split data into train and test sets (70 % / 30 %, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=6)

# Data augmentation (if necessary): bootstrap the training set up to
# `target_size` rows. Disabled by default; change the guard to enable it.
if False:
    target_size = 500
    current_size = X_train.shape[0]

    if current_size < target_size:
        samples_needed = target_size - current_size
        # Draw row positions once and use them for BOTH features and labels.
        # Sampling the two independently would pair feature rows with
        # unrelated labels and inject label noise into the training set.
        resample_positions = np.random.RandomState(42).randint(0, current_size, size=samples_needed)
        additional_X = X_train.iloc[resample_positions]
        additional_y = np.asarray(y_train)[resample_positions]

        # Concatenate the original and augmented data
        X_train = pd.concat([X_train, additional_X], ignore_index=True)
        y_train = np.concatenate([y_train, additional_y])

# Preprocessing: drop zero-variance features, then drop one feature out of
# every highly correlated pair. All statistics come from the training split.
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
variances = X_train[numeric_cols].var()
# NOTE: despite its name this mask is True for the columns to KEEP (variance > 0)
low_variance_mask = variances > 0
selected_numeric_cols = numeric_cols[low_variance_mask]

# Ensure that non-numeric columns exist in X_train for union operation
valid_non_numeric_cols = [col for col in non_numeric_cols if col in X_train.columns]

# Filter train and test sets with one shared column set so they cannot diverge
kept_columns = selected_numeric_cols.union(valid_non_numeric_cols)
X_train_filtered = X_train[kept_columns]
X_test_filtered = X_test[kept_columns]


# Remove highly correlated features. The absolute value is used so that
# strongly *negatively* correlated features (r < -0.98) are treated as
# redundant too; only the upper triangle is inspected so that the first
# feature of each correlated pair is retained.
correlation_matrix = X_train_filtered.corr()
correlation_threshold = 0.98
upper_triangle = correlation_matrix.abs().where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
correlated_features = [column for column in upper_triangle.columns if (upper_triangle[column] > correlation_threshold).any()]
uncorrelated_mask = ~X_train_filtered.columns.isin(correlated_features)
X_train_filtered = X_train_filtered.loc[:, uncorrelated_mask]
X_test_filtered = X_test_filtered.loc[:, uncorrelated_mask]


# Apply forward selection using SI
max_features = 20  # Define the maximum number of features to select
selected_features = forward_selection_si(X_train_filtered, y_train, max_features=max_features)

# Filter train and test sets based on selected features
X_train_SI = X_train_filtered[selected_features]
X_test_SI = X_test_filtered[selected_features]

# Feature Selection: Use SelectKBest to select top features based on ANOVA F-value.
# The same budget as the SI selection is used so both methods are comparable;
# it is capped at the number of available columns because SelectKBest rejects
# a k larger than the feature count.
from sklearn.feature_selection import SelectKBest, f_classif
anova_k = min(max_features, X_train_filtered.shape[1])
select_kbest = SelectKBest(score_func=f_classif, k=anova_k)  # Select top `anova_k` features
anova_columns = [f'feature_{i}' for i in range(anova_k)]
X_train_ANOVA = pd.DataFrame(select_kbest.fit_transform(X_train_filtered, y_train), columns=anova_columns)
X_test_ANOVA = pd.DataFrame(select_kbest.transform(X_test_filtered), columns=anova_columns)

# Features selected by Forward Selection SI
features_si = selected_features  # Already available from forward_selection_si

# Features selected by ANOVA (map the selector's column positions back to names)
anova_selected_indices = select_kbest.get_support(indices=True)
features_anova = X_train_filtered.columns[anova_selected_indices].tolist()
print(f"anova_selected_indices {anova_selected_indices}")
print(f"features_anova {features_anova}")

# Compare the two methods
features_in_both = set(features_si).intersection(features_anova)  # Features common to both methods
features_only_in_si = set(features_si).difference(features_anova)  # Features unique to Forward Selection SI
features_only_in_anova = set(features_anova).difference(features_si)  # Features unique to ANOVA

# Display results
print("Features selected by Forward Selection SI:", features_si)
print("Features selected by ANOVA:", features_anova)
print("Common features:", features_in_both)
print("Features only in Forward Selection SI:", features_only_in_si)
print("Features only in ANOVA:", features_only_in_anova)

# Standardise both feature sets. Every transformer in this section is fitted
# on the training split only and then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train_SI)
X_train_SI = scaler.transform(X_train_SI)
X_test_SI = scaler.transform(X_test_SI)

# `scaler` is rebound here; from now on it refers to the ANOVA-feature scaler
scaler = StandardScaler().fit(X_train_ANOVA)
X_train_scaled = scaler.transform(X_train_ANOVA)
X_test_scaled = scaler.transform(X_test_ANOVA)

# PCA on the scaled ANOVA features, keeping enough components for 95% of the variance
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Concatenation fusion: scaled ANOVA features side by side with their PCA projection
X_train_concat = np.hstack((X_train_scaled, X_train_pca))
X_test_concat = np.hstack((X_test_scaled, X_test_pca))

# Sparse representation fusion: SparsePCA with 20 components on the scaled ANOVA features
sparse_pca = SparsePCA(n_components=20, random_state=42)
X_train_sparse = sparse_pca.fit_transform(X_train_scaled)
X_test_sparse = sparse_pca.transform(X_test_scaled)

# Weighted fusion: same layout as the concatenation, but each half is scaled
# by a fixed weight before stacking
weight_pca = 0.7
weight_original = 0.3
X_train_weighted = np.concatenate(
    (weight_original * X_train_scaled, weight_pca * X_train_pca), axis=1
)
X_test_weighted = np.concatenate(
    (weight_original * X_test_scaled, weight_pca * X_test_pca), axis=1
)

# Registry of (train, test) matrices, keyed by the feature-set name used in
# all later result labels
fused_datasets = {
    "SI": (np.array(X_train_SI), np.array(X_test_SI)),
    "PCA": (X_train_pca, X_test_pca),
    "Concatenation": (X_train_concat, X_test_concat),
    "Sparse": (X_train_sparse, X_test_sparse),
    "Weighted": (X_train_weighted, X_test_weighted),
}
# One independent, unfitted estimator per (feature set, algorithm) pair,
# stored under the key "<feature set> <algorithm>".
classifiers = {}

for feature_name in fused_datasets:
    classifiers[f"{feature_name} Random Forest"] = RandomForestClassifier(random_state=42)
    classifiers[f"{feature_name} XGBoost"] = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    classifiers[f"{feature_name} Decision Tree"] = DecisionTreeClassifier(random_state=42)
    classifiers[f"{feature_name} SVM"] = SVC(probability=True, random_state=42)
    classifiers[f"{feature_name} KNN"] = KNeighborsClassifier()
    classifiers[f"{feature_name} Logistic Regression"] = LogisticRegression(max_iter=1000)
    classifiers[f"{feature_name} Naive Bayes"] = GaussianNB()
    classifiers[f"{feature_name} Gradient Boost"] = GradientBoostingClassifier(random_state=42)

# Vote weight per algorithm for the ensemble rules (majority voting,
# weighted voting, probability averaging). The key order of this dict also
# fixes the order in which the algorithms are iterated later on.
weights = {
    "Random Forest": 2,
    "Logistic Regression": 1.5,
    "XGBoost": 2,
    "Decision Tree": 1,
    "SVM": 1,
    "KNN": 1,
    "Naive Bayes": 1,
    "Gradient Boost": 1,
}

# Accumulators filled by the evaluation loop
results, predictions, probabilities = {}, {}, {}

# Fit every base classifier on every feature set, score it on the test split,
# then score three ensemble rules built from those classifiers' outputs.
metric_keys = ("accuracy", "precision", "recall", "f1")
total_weight = sum(weights.values())

for fusion_name, (X_train_fused, X_test_fused) in fused_datasets.items():
    print(f"Number of features for {fusion_name} fusion: {X_train_fused.shape[1]}")
    print(f"SI is {compute_si(X_train_fused, y_train):.3f} for {fusion_name} features")

    for clf_name in weights:
        clf = classifiers[f"{fusion_name} {clf_name}"]
        clf.fit(X_train_fused, y_train)
        y_pred = clf.predict(X_test_fused)
        # Keyed by algorithm only: entries are overwritten for each feature set
        predictions[clf_name] = y_pred
        probabilities[clf_name] = clf.predict_proba(X_test_fused)[:, 1]  # column 1 of the probability output

        # Test-split metrics for this (feature set, classifier) pair
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
        results[f"{fusion_name} + {clf_name}"] = dict(
            zip(metric_keys, (accuracy, precision, recall, f1))
        )

        # Report every test sample this classifier got wrong
        misclassified_indices = np.flatnonzero(y_test != y_pred)
        if misclassified_indices.size:
            print(f"\nMisclassified samples for {fusion_name} + {clf_name}:")
            for idx in misclassified_indices:
                print(f"  Index: {idx}, True Label: {y_test[idx]}, Predicted Label: {y_pred[idx]}")

    # ---- Ensemble rules over the predictions collected above ----
    pred_df = pd.DataFrame(predictions)

    # Weighted sum of the hard 0/1 predictions
    weighted_votes = np.zeros(len(X_test_fused))
    for name, weight in weights.items():
        weighted_votes += pred_df[name] * weight

    # Most common prediction for each instance
    majority_vote_pred = pred_df.mode(axis=1)[0]
    # Positive when at least half of the total vote weight says 1
    weighted_vote_pred = (weighted_votes >= (total_weight / 2)).astype(int)
    # Weight-averaged class-1 probability, thresholded at 0.5
    avg_prob = sum(weight * probabilities[name] for name, weight in weights.items()) / total_weight
    prob_avg_pred = (avg_prob >= 0.5).astype(int)

    ensemble_predictions = (
        ("Majority Voting", majority_vote_pred),
        ("Weighted Voting", weighted_vote_pred),
        ("Probability Voting", prob_avg_pred),
    )
    for ensemble_label, ensemble_pred in ensemble_predictions:
        accuracy, precision, recall, f1 = evaluate(y_test, ensemble_pred)
        results[f"{fusion_name} + {ensemble_label}"] = dict(
            zip(metric_keys, (accuracy, precision, recall, f1))
        )
def _make_base_classifiers():
    """Return fresh, unfitted base classifiers.

    The insertion order matters: position i in this dict is the class index
    the selector MLP is trained to output for classifier i.
    """
    return {
        "Random Forest": RandomForestClassifier(random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "SVM": SVC(probability=True, random_state=42),
        "KNN": KNeighborsClassifier(),
        "Naive Bayes": GaussianNB(),
        "Gradient Boost": GradientBoostingClassifier(random_state=42),
    }


def _best_classifier_labels(fitted_classifiers, X, y_true, f1_by_name, power):
    """Return, for every row of X, the index of the classifier that scores best.

    A classifier's score for a row is its predicted probability for the row's
    true class (column 1 when the label equals 1, column 0 otherwise) multiplied
    by ``f1_by_name[name] ** power``. Ties go to the first classifier in
    iteration order. ``predict_proba`` is called once per classifier on the
    whole matrix instead of once per sample.
    """
    rows = np.arange(X.shape[0])
    true_col = (np.asarray(y_true) == 1).astype(int)
    scores = np.column_stack([
        clf.predict_proba(X)[rows, true_col] * f1_by_name[name] ** power
        for name, clf in fitted_classifiers.items()
    ])
    return np.argmax(scores, axis=1)


# One column of MCF predictions per fused dataset
y_predict_MCF = np.zeros((y_test.shape[0], len(fused_datasets)))
j = 0
for fusion_name, (X_train_fused, X_test_fused) in fused_datasets.items():

    # Selector MLP: maps a sample to the index of the base classifier to trust
    mlp_model = Sequential()
    mlp_model.add(Dense(4, input_dim=X_train_fused.shape[1], activation='relu'))
    mlp_model.add(Dense(5, activation='relu'))
    mlp_model.add(Dense(len(weights), activation='softmax'))

    mlp_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Every step is one full pass over all folds; the same model keeps training
    num_steps = 7
    num_folds = 12

    # Fixed random_state: every step iterates over the same fold partition
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=6)

    # Mean of the per-fold final-epoch metrics, one entry per step
    mean_train_accuracy_per_step = []
    mean_val_accuracy_per_step = []
    mean_train_loss_per_step = []
    mean_val_loss_per_step = []
    for step in range(num_steps):
        train_accuracy_per_fold = []
        val_accuracy_per_fold = []
        train_loss_per_fold = []
        val_loss_per_fold = []
        for fold, (train_index, test_index) in enumerate(kf.split(X_train_fused)):
            print(f"\nProcessing Fold {fold + 1}")

            # Base classifiers are fitted on the fold's training part; the
            # held-out part is what the selector MLP is trained on
            _X_train, X_val = X_train_fused[train_index], X_train_fused[test_index]
            y_train_mlp, y_val = y_train[train_index], y_train[test_index]

            _classifiers = _make_base_classifiers()
            for clf_name, clf in _classifiers.items():
                clf.fit(_X_train, y_train_mlp)

            # NOTE(review): the F1 values read from `results` appear to have
            # been computed on the test split, and the Keras validation targets
            # below are derived from y_test, so the validation curves are
            # optimistic. Confirm whether this is intended.
            f1_by_name = {name: results[f"{fusion_name} + {name}"]["f1"] for name in _classifiers}

            # Target for each sample = index of the classifier with the highest
            # F1^4-weighted probability for the sample's true class
            y_for_mlp_val = _best_classifier_labels(_classifiers, X_val, y_val, f1_by_name, 4)
            y_for_mlp_test = _best_classifier_labels(_classifiers, X_test_fused, y_test, f1_by_name, 4)
            print(y_for_mlp_val)

            history = mlp_model.fit(X_val , y_for_mlp_val, epochs=15, batch_size=16, verbose=1, validation_data=(X_test_fused, y_for_mlp_test))

            # Capture the final train and validation accuracy and loss for each fold in this step
            train_accuracy_per_fold.append(history.history['accuracy'][-1])
            val_accuracy_per_fold.append(history.history['val_accuracy'][-1])
            train_loss_per_fold.append(history.history['loss'][-1])
            val_loss_per_fold.append(history.history['val_loss'][-1])

        # Calculate the mean metrics across folds for this step
        mean_train_accuracy_per_step.append(np.mean(train_accuracy_per_fold))
        mean_val_accuracy_per_step.append(np.mean(val_accuracy_per_fold))
        mean_train_loss_per_step.append(np.mean(train_loss_per_fold))
        mean_val_loss_per_step.append(np.mean(val_loss_per_fold))

    # Plot mean accuracy over steps
    plt.figure(figsize=(10, 5))
    plt.plot(range(1, num_steps + 1), mean_train_accuracy_per_step, marker='o', linestyle='-', label='Mean Train Accuracy')
    plt.plot(range(1, num_steps + 1), mean_val_accuracy_per_step, marker='o', linestyle='--', label='Mean Validation Accuracy')
    plt.title('Mean Training and Validation Accuracy Across Steps')
    plt.xlabel('Step')
    plt.ylabel('Mean Accuracy')
    plt.legend()
    plt.show()

    # Plot mean loss over steps
    plt.figure(figsize=(10, 5))
    plt.plot(range(1, num_steps + 1), mean_train_loss_per_step, marker='o', linestyle='-', label='Mean Train Loss')
    plt.plot(range(1, num_steps + 1), mean_val_loss_per_step, marker='o', linestyle='--', label='Mean Validation Loss')
    plt.title('Mean Training and Validation Loss Across Steps')
    plt.xlabel('Step')
    plt.ylabel('Mean Loss')
    plt.legend()
    plt.show()

    # The MLP picks one base classifier per test sample
    selected_classifier = np.argmax(mlp_model.predict(X_test_fused), axis=1)
    _y_pred = np.zeros((y_test.shape[0],len(_classifiers)))

    # Column i holds the predictions of the i-th classifier in `weights` order,
    # which must match the order used to build the MLP targets above. These are
    # the classifiers stored in `classifiers`, not the per-fold ones.
    for i, clf_name in enumerate(weights.keys()):
        clf = classifiers[f"{fusion_name} {clf_name}"]
        _y_pred[:,i] = clf.predict(X_test_fused)
    # For each row take the prediction of the classifier the MLP selected
    y_pred = _y_pred[np.arange(_y_pred.shape[0]), selected_classifier]
    y_predict_MCF[:,j] = y_pred
    j += 1
    # Calculate various evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)

    # Store metrics in the results dictionary
    results[f"{fusion_name} + MCF"] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }
    print(results[f"{fusion_name} + MCF"])
    # Identify and print misclassified samples
    misclassified_indices = np.where(y_test != y_pred)[0]
    if len(misclassified_indices) > 0:
        print(f"\nMisclassified samples for {fusion_name} + MCF:")
        for idx in misclassified_indices:
            print(f"  Index: {idx}, True Label: {y_test[idx]}, Predicted Label: {y_pred[idx]}")

In [None]:
!pip install ucimlrepo

from ucimlrepo import fetch_ucirepo
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import SparsePCA
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

##


# Assume that fetch_ucirepo and necessary dataset fetching utilities are defined
# Fetch dataset (UCI repository id 732)
darwin = fetch_ucirepo(id=732)

# Data
# Drop the first feature (participant ID). Take an explicit copy so that the
# in-place label encoding below writes to our own frame instead of a slice of
# the fetched data (avoids pandas' SettingWithCopyWarning).
X = darwin.data.features.iloc[:, 1:].copy()
y = darwin.data.targets.iloc[:, 0]  # Assuming y is single-column

# Convert categorical (object-dtype) columns to integer codes
non_numeric_cols = X.select_dtypes(include=['object']).columns
for col in non_numeric_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Encode target. Either branch yields a NumPy array, so the positional indexing
# used later (y_train[train_index], y_test[idx]) can never be interpreted as a
# label-based lookup on a shuffled pandas index.
y = LabelEncoder().fit_transform(y) if y.dtype == 'object' else y.to_numpy()

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6)

# Data augmentation (disabled): oversample the training set with replacement
# until it holds `target_size` rows.
if False:
    target_size = 500
    current_size = X_train.shape[0]

    if current_size < target_size:
        samples_needed = target_size - current_size
        # Draw the row positions once and use them for both the features and
        # the labels, so every duplicated row keeps its own label.
        rng = np.random.default_rng(42)
        extra_positions = rng.integers(0, current_size, size=samples_needed)
        additional_X = X_train.iloc[extra_positions]
        additional_y = np.asarray(y_train)[extra_positions]

        # Concatenate the original and augmented data
        X_train = pd.concat([X_train, additional_X], ignore_index=True)
        y_train = np.concatenate([y_train, additional_y])

# Preprocessing step 1: drop constant columns.
# The variance is measured on the training split only; the resulting column
# selection is then applied unchanged to the test split.
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
variances = X_train.loc[:, numeric_cols].var()
low_variance_mask = variances.gt(0)  # True for columns to KEEP (variance above zero)
selected_numeric_cols = numeric_cols[low_variance_mask]

# Originally non-numeric columns that are still present in X_train
valid_non_numeric_cols = [name for name in non_numeric_cols if name in X_train.columns]

# Build the column selection once and apply it to both splits
kept_columns = selected_numeric_cols.union(valid_non_numeric_cols)
X_train_filtered, X_test_filtered = X_train[kept_columns], X_test[kept_columns]

print("X_train_filtered shape:", X_train_filtered.shape)
print("X_test_filtered shape:", X_test_filtered.shape)



# Remove highly correlated features.
# Work on the absolute correlation so that strongly negatively correlated pairs
# (r < -threshold), which are just as redundant, are dropped as well.
correlation_matrix = X_train_filtered.corr().abs()
correlation_threshold = 0.80
# Keep only the strict upper triangle: each pair is inspected once, and the
# column that is dropped is the later one of the pair.
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1))
correlated_features = [column for column in upper_triangle.columns if (upper_triangle[column] > correlation_threshold).any()]
# The columns to drop are chosen on the training split and removed from both splits
X_train_filtered = X_train_filtered.loc[:, ~X_train_filtered.columns.isin(correlated_features)]
X_test_filtered = X_test_filtered.loc[:, ~X_test_filtered.columns.isin(correlated_features)]

# Feature Selection: Use SelectKBest to select top features based on ANOVA F-value
from sklearn.feature_selection import SelectKBest, f_classif
# Ask for at most 20 features; clamp to what survived the earlier filters so
# SelectKBest does not raise when fewer than 20 columns remain.
n_selected = min(20, X_train_filtered.shape[1])
select_kbest = SelectKBest(score_func=f_classif, k=n_selected)
# The selector returns plain arrays, so the columns get generic names
selected_names = [f'feature_{i}' for i in range(n_selected)]
X_train_filtered = pd.DataFrame(select_kbest.fit_transform(X_train_filtered, y_train), columns=selected_names)
X_test_filtered = pd.DataFrame(select_kbest.transform(X_test_filtered), columns=selected_names)

# Standardise features: statistics are learned on the training split and
# reused for the test split
scaler = StandardScaler().fit(X_train_filtered)
X_train_scaled, X_test_scaled = scaler.transform(X_train_filtered), scaler.transform(X_test_filtered)

# PCA keeping enough components to explain 80% of the variance
pca = PCA(n_components=0.80)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Concatenation fusion: scaled features followed by their PCA projection
X_train_concat = np.hstack([X_train_scaled, X_train_pca])
X_test_concat = np.hstack([X_test_scaled, X_test_pca])

# Sparse representation fusion: 10 sparse components
sparse_pca = SparsePCA(n_components=10, random_state=42)
X_train_sparse = sparse_pca.fit_transform(X_train_scaled)
X_test_sparse = sparse_pca.transform(X_test_scaled)

# Weighted fusion: same layout as the concatenation, but each part is scaled
# by its own weight first
weight_pca = 0.7
weight_original = 0.3
X_train_weighted = np.concatenate([weight_original * X_train_scaled, weight_pca * X_train_pca], axis=1)
X_test_weighted = np.concatenate([weight_original * X_test_scaled, weight_pca * X_test_pca], axis=1)

# Fused datasets, keyed by fusion name: (train matrix, test matrix)
fused_datasets = dict(
    PCA=(X_train_pca, X_test_pca),
    Concatenation=(X_train_concat, X_test_concat),
    Sparse=(X_train_sparse, X_test_sparse),
    Weighted=(X_train_weighted, X_test_weighted),
)

# One constructor per base model. Each is called once per fusion so that every
# fused dataset gets its own, independent estimator instance.
_classifier_factories = {
    "Random Forest": lambda: RandomForestClassifier(random_state=42),
    "XGBoost": lambda: XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    "Decision Tree": lambda: DecisionTreeClassifier(random_state=42),
    "SVM": lambda: SVC(probability=True, random_state=42),
    "KNN": lambda: KNeighborsClassifier(),
    "Logistic Regression": lambda: LogisticRegression(max_iter=1000),
    "Naive Bayes": lambda: GaussianNB(),
    "Gradient Boost": lambda: GradientBoostingClassifier(random_state=42),
}

# Registry keyed "<fusion> <model>"
classifiers = {}
for feature_name in fused_datasets:
    for _label, _build in _classifier_factories.items():
        classifiers[f"{feature_name} {_label}"] = _build()

# Per-model vote weights for the weighted-voting and probability-averaging
# ensembles; the key order is also the model order used by later cells
weights = {
    "Random Forest": 2,
    "Logistic Regression": 1.5,
    "XGBoost": 2,
    "Decision Tree": 1,
    "SVM": 1,
    "KNN": 1,
    "Naive Bayes": 1,
    "Gradient Boost": 1,
}

# Metrics per "<fusion> + <model>"; predictions/probabilities hold the
# per-model test outputs of the fusion currently being processed
results = {}
predictions = {}
probabilities = {}

for fusion_name, (X_train_fused, X_test_fused) in fused_datasets.items():
    print(f"Number of features for {fusion_name} fusion: {X_train_fused.shape[1]}")

    # --- individual classifiers -------------------------------------------
    for clf_name in weights:
        clf = classifiers[f"{fusion_name} {clf_name}"]
        clf.fit(X_train_fused, y_train)
        y_pred = clf.predict(X_test_fused)
        predictions[clf_name] = y_pred
        # Column 1 of predict_proba is kept for the probability-averaging ensemble
        probabilities[clf_name] = clf.predict_proba(X_test_fused)[:, 1]

        # Weighted-average metrics on the test split
        accuracy = accuracy_score(y_test, y_pred)
        precision, recall, f1 = (
            scorer(y_test, y_pred, average='weighted', zero_division=1)
            for scorer in (precision_score, recall_score, f1_score)
        )
        results[f"{fusion_name} + {clf_name}"] = dict(
            accuracy=accuracy, precision=precision, recall=recall, f1=f1
        )

        # Report every test sample this classifier got wrong
        misclassified_indices = np.where(y_test != y_pred)[0]
        if misclassified_indices.size:
            print(f"\nMisclassified samples for {fusion_name} + {clf_name}:")
            for idx in misclassified_indices:
                print(f"  Index: {idx}, True Label: {y_test[idx]}, Predicted Label: {y_pred[idx]}")

    # --- ensembles over the classifiers of this fusion ----------------------
    pred_df = pd.DataFrame(predictions)

    # Weighted sum of the predicted labels
    weighted_votes = np.zeros(len(X_test_fused))
    for name, weight in weights.items():
        weighted_votes += pred_df[name] * weight

    # Row-wise mode; column 0 of the mode frame is taken as the majority label
    majority_vote_pred = pred_df.mode(axis=1)[0]
    # Class 1 when the weighted votes reach half of the total weight
    weighted_vote_pred = (weighted_votes >= (sum(weights.values()) / 2)).astype(int)
    # Weight-averaged probability, thresholded at 0.5
    avg_prob = sum(weight * probabilities[name] for name, weight in weights.items()) / sum(weights.values())
    prob_avg_pred = (avg_prob >= 0.5).astype(int)

    ensemble_outputs = (
        (f"{fusion_name} + Majority Voting", majority_vote_pred),
        (f"{fusion_name} + Weighted Voting", weighted_vote_pred),
        (f"{fusion_name} + Probability Voting", prob_avg_pred),
    )
    for result_key, ensemble_pred in ensemble_outputs:
        accuracy, precision, recall, f1 = evaluate(y_test, ensemble_pred)
        results[result_key] = dict(
            accuracy=accuracy, precision=precision, recall=recall, f1=f1
        )


def _best_classifier_labels(fitted_classifiers, X, y_true, f1_by_name, power):
    """Return, for every row of X, the index of the classifier that scores best.

    A classifier's score for a row is its predicted probability for the row's
    true class (column 1 when the label equals 1, column 0 otherwise) multiplied
    by ``f1_by_name[name] ** power``. Ties go to the first classifier in
    iteration order. ``predict_proba`` is called once per classifier on the
    whole matrix instead of once per sample.
    """
    rows = np.arange(X.shape[0])
    true_col = (np.asarray(y_true) == 1).astype(int)
    scores = np.column_stack([
        clf.predict_proba(X)[rows, true_col] * f1_by_name[name] ** power
        for name, clf in fitted_classifiers.items()
    ])
    return np.argmax(scores, axis=1)


# Selector MLP for the "Weighted" fusion: maps a sample to the index of the
# base classifier whose prediction should be used for it
mlp_model = Sequential()
mlp_model.add(Dense(5, input_dim=X_train_weighted.shape[1], activation='relu'))
mlp_model.add(Dense(5, activation='relu'))
mlp_model.add(Dense(len(weights), activation='softmax'))
print(len(weights))

mlp_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Set up K-Fold cross-validation; the fixed random_state means every step
# iterates over the same fold partition
kf = KFold(n_splits=5, shuffle=True, random_state=6)
for steps in range(5):
    for fold, (train_index, test_index) in enumerate(kf.split(X_train_weighted)):
        print(f"\nProcessing Fold {fold + 1}")

        # Split data for the current fold. The fold's training part is bound to
        # `_X_train` so the module-level `X_train` frame is not overwritten.
        _X_train, X_val = X_train_weighted[train_index], X_train_weighted[test_index]
        y_train_mlp, y_val = y_train[train_index], y_train[test_index]

        # Fresh base classifiers per fold; the dict order defines the class
        # index the MLP learns for each classifier
        _classifiers ={
            "Random Forest": RandomForestClassifier(random_state=42),
            "Logistic Regression": LogisticRegression(max_iter=1000),
            "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
            "Decision Tree": DecisionTreeClassifier(random_state=42),
            "SVM": SVC(probability=True, random_state=42),
            "KNN": KNeighborsClassifier(),
            "Naive Bayes": GaussianNB(),
            "Gradient Boost": GradientBoostingClassifier(random_state=42),
        }

        for clf_name, clf in _classifiers.items():
            clf.fit(_X_train, y_train_mlp)

        # NOTE(review): these F1 values were computed on the test split by the
        # evaluation loop above, so test information leaks into the MLP targets.
        f1_by_name = {name: results[f"Weighted + {name}"]["f1"] for name in _classifiers}

        # Target for each held-out sample = index of the classifier with the
        # highest F1^2-weighted probability for the sample's true class
        y_for_mlp = _best_classifier_labels(_classifiers, X_val, y_val, f1_by_name, 2)
        print(y_for_mlp)
        mlp_model.fit(X_val, y_for_mlp, epochs=50, batch_size=10, verbose=1)

# The MLP picks one base classifier per test sample
selected_classifier = np.argmax(mlp_model.predict(X_test_weighted), axis=1)
_y_pred = np.zeros((y_test.shape[0],len(_classifiers)))

# Column i holds the predictions of the i-th classifier in `weights` order,
# which must match the `_classifiers` order used to build the MLP targets.
# These are the classifiers stored in `classifiers`, not the per-fold ones.
for i, clf_name in enumerate(weights.keys()):
    clf = classifiers[f"Weighted {clf_name}"]
    _y_pred[:,i] = clf.predict(X_test_weighted)
# For each row take the prediction of the classifier the MLP selected
y_pred = _y_pred[np.arange(_y_pred.shape[0]), selected_classifier]
print(selected_classifier)
print(selected_classifier.shape)
print(y_pred)
print(y_pred.shape)
# Calculate various evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)

# Store metrics in the results dictionary
MCF_results = {
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1": f1
}
print(MCF_results)
# Identify and print misclassified samples. The header names the MCF model
# explicitly: the loop variables left over from earlier loops would label
# these rows as "Weighted + Gradient Boost".
misclassified_indices = np.where(y_test != y_pred)[0]
if len(misclassified_indices) > 0:
    print("\nMisclassified samples for Weighted + MCF:")
    for idx in misclassified_indices:
        print(f"  Index: {idx}, True Label: {y_test[idx]}, Predicted Label: {y_pred[idx]}")



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

def create_performance_heatmap(results, title):
    """Show an annotated heatmap with one row per model and one column per metric.

    Args:
        results: mapping of model name to a dict holding at least the keys
            'accuracy', 'precision', 'recall' and 'f1'.
        title: text placed above the heatmap.
    """
    metrics = ['accuracy', 'precision', 'recall', 'f1']

    # One row of scores per model, in the insertion order of `results`
    score_rows = [[scores[metric] for metric in metrics] for scores in results.values()]
    df = pd.DataFrame(score_rows, columns=metrics, index=results.keys())

    # Draw the heatmap on a fresh figure
    plt.figure(figsize=(12, 8))
    sns.heatmap(df, annot=True, cmap='YlOrRd', fmt='.3f', cbar_kws={'label': 'Score'})
    plt.title(title)
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

def create_fusion_comparison(results):
    """Show a grouped bar chart of the mean score of every fusion method.

    For each fusion method the four metrics are averaged over all entries of
    ``results`` whose key starts with ``"<fusion> + "``.

    Args:
        results: mapping of ``"<fusion> + <model>"`` to a dict with the keys
            'accuracy', 'precision', 'recall' and 'f1'.
    """
    # Prepare data for each fusion method
    fusion_methods = ['PCA', 'Concatenation', 'Sparse', 'Weighted']
    metrics = ['accuracy', 'precision', 'recall', 'f1']

    # Rows = models, columns = metrics
    df = pd.DataFrame(results).T

    # Calculate mean scores for each fusion method
    fusion_means = {}
    for fusion in fusion_methods:
        # Match the key prefix exactly. A substring test would also count rows
        # such as "PCA + Weighted Voting" towards the "Weighted" fusion.
        fusion_data = df[df.index.str.startswith(f"{fusion} + ")]
        fusion_means[fusion] = fusion_data.mean()

    # Create grouped bar plot: one group per fusion, one bar per metric
    fig, ax = plt.subplots(figsize=(12, 6))
    x = np.arange(len(fusion_methods))
    width = 0.2

    for i, metric in enumerate(metrics):
        values = [fusion_means[fusion][metric] for fusion in fusion_methods]
        ax.bar(x + i*width, values, width, label=metric.capitalize())

    ax.set_ylabel('Score')
    ax.set_title('Performance Comparison of Fusion Methods')
    # Centre the tick under the four bars of each group
    ax.set_xticks(x + width * 1.5)
    ax.set_xticklabels(fusion_methods)
    ax.legend()
    plt.tight_layout()
    plt.show()

def create_classifier_comparison(results):
    """Show a grouped bar chart of mean accuracy and F1 for every base classifier.

    Each classifier's scores are averaged over all entries of ``results``
    whose key contains the classifier's name.

    Args:
        results: mapping of model name to a dict of metric scores.
    """
    clf_labels = ['Random Forest', 'XGBoost', 'Decision Tree', 'SVM', 'KNN',
                  'Logistic Regression', 'Naive Bayes', 'Gradient Boost']
    metrics = ['accuracy', 'f1']

    # Rows = models, columns = metrics
    score_table = pd.DataFrame(results).T

    # Mean of every metric over the rows that mention the classifier
    classifier_means = {
        label: score_table[score_table.index.str.contains(label)].mean()
        for label in clf_labels
    }

    # One group of bars per classifier, one bar per metric
    fig, ax = plt.subplots(figsize=(12, 6))
    x = np.arange(len(clf_labels))
    width = 0.35

    for offset, metric in enumerate(metrics):
        heights = [classifier_means[label][metric] for label in clf_labels]
        ax.bar(x + offset*width, heights, width, label=metric.capitalize())

    ax.set_ylabel('Score')
    ax.set_title('Performance Comparison of Classifiers')
    # Centre the tick between the two bars of each group
    ax.set_xticks(x + width/2)
    ax.set_xticklabels(clf_labels, rotation=45, ha='right')
    ax.legend()
    plt.tight_layout()
    plt.show()

def visualize_mcf_comparison(results, mcf_results):
    """Show a grouped bar chart comparing the best entry of ``results`` with MCF.

    Args:
        results: mapping of model name to a dict with the keys 'accuracy',
            'precision', 'recall' and 'f1'; the entry with the highest 'f1'
            is used as the "Best Traditional" method.
        mcf_results: dict with the same four keys for the MCF model.
    """
    methods = ['Best Traditional', 'MCF']
    metrics = ['accuracy', 'precision', 'recall', 'f1']

    # Scores of the entry with the highest F1 (first one wins on ties)
    _, best_scores = max(results.items(), key=lambda item: item[1]['f1'])

    # Method label -> score dict, in the same order as `methods`
    comparison_data = dict(zip(methods, (best_scores, mcf_results)))

    # One group of bars per method, one bar per metric
    fig, ax = plt.subplots(figsize=(10, 6))
    x = np.arange(len(methods))
    width = 0.2

    for offset, metric in enumerate(metrics):
        heights = [comparison_data[method][metric] for method in methods]
        ax.bar(x + offset*width, heights, width, label=metric.capitalize())

    ax.set_ylabel('Score')
    ax.set_title('MCF vs Best Traditional Method')
    # Centre the tick under the four bars of each group
    ax.set_xticks(x + width * 1.5)
    ax.set_xticklabels(methods)
    ax.legend()
    plt.tight_layout()
    plt.show()

# Text summary of the best performers
def print_summary(results, mcf_results):
    """Print the best model for every metric, followed by the MCF scores.

    Args:
        results: mapping of model name to a dict with the keys 'accuracy',
            'precision', 'recall' and 'f1'.
        mcf_results: dict with the same four keys for the MCF model.
    """
    metrics = ['accuracy', 'precision', 'recall', 'f1']

    print("=== Performance Summary ===")
    for metric in metrics:
        # First model with the highest score wins on ties
        winner = max(results, key=lambda name: results[name][metric])
        print(f"\nBest {metric}: {winner}")
        print(f"Score: {results[winner][metric]:.4f}")

    print("\n=== MCF Performance ===")
    for metric in metrics:
        print(f"{metric}: {mcf_results[metric]:.4f}")

# Create all visualizations.
# `results` and `MCF_results` are the module-level dicts filled in by the
# training cell; the four plotting helpers each show their own figure and
# print_summary writes a text report to stdout.
create_performance_heatmap(results, 'Performance Metrics Heatmap')
create_fusion_comparison(results)
create_classifier_comparison(results)
visualize_mcf_comparison(results, MCF_results)
print_summary(results, MCF_results)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def plot_confusion_matrices(results, y_test, mcf_pred, class_names=None):
    """Plot confusion matrices for the best single classifier and the MCF model.

    The "best traditional" model is the entry of ``results`` with the highest
    F1 score among the entries that correspond to an estimator in the
    module-level ``classifiers`` dict. Its predictions are recomputed on the
    test matrix of its own fusion, taken from the module-level
    ``fused_datasets``. Entries without an estimator (e.g. voting ensembles)
    are not considered, because no predictions can be regenerated for them.

    Args:
        results: mapping of ``"<fusion> + <model>"`` to a dict containing 'f1'.
        y_test: true labels of the test split.
        mcf_pred: MCF predictions for the test split.
        class_names: optional tick labels for both axes.

    Raises:
        ValueError: if no entry of ``results`` corresponds to an estimator in
            ``classifiers``.
    """
    # Result keys are "<fusion> + <model>", classifier keys "<fusion> <model>".
    # Resolve every result to the estimator and test matrix of its OWN fusion:
    # matching on the model name alone would pick another fusion's estimator
    # and feed it a matrix with the wrong number of features.
    candidates = {}
    for fusion_name, (_, X_test_fused) in fused_datasets.items():
        prefix = f"{fusion_name} + "
        for result_name in results:
            if not result_name.startswith(prefix):
                continue
            clf = classifiers.get(f"{fusion_name} {result_name[len(prefix):]}")
            if clf is not None:
                candidates[result_name] = (clf, X_test_fused)
    if not candidates:
        raise ValueError("no entry in results corresponds to a classifier in `classifiers`")

    # Best candidate by F1 (first one wins on ties)
    best_model_name = max(candidates, key=lambda name: results[name]['f1'])
    best_clf, best_X_test = candidates[best_model_name]
    best_traditional_pred = best_clf.predict(best_X_test)

    # Create figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Plot confusion matrix for best traditional model
    cm1 = confusion_matrix(y_test, best_traditional_pred)
    sns.heatmap(cm1, annot=True, fmt='d', cmap='Blues', ax=ax1)
    ax1.set_title(f'Confusion Matrix\nBest Traditional Model ({best_model_name})')
    ax1.set_xlabel('Predicted')
    ax1.set_ylabel('True')
    if class_names:
        ax1.set_xticklabels(class_names)
        ax1.set_yticklabels(class_names)

    # Plot confusion matrix for MCF model
    cm2 = confusion_matrix(y_test, mcf_pred)
    sns.heatmap(cm2, annot=True, fmt='d', cmap='Blues', ax=ax2)
    ax2.set_title('Confusion Matrix\nMCF Model')
    ax2.set_xlabel('Predicted')
    ax2.set_ylabel('True')
    if class_names:
        ax2.set_xticklabels(class_names)
        ax2.set_yticklabels(class_names)

    plt.tight_layout()
    plt.show()

    # Print additional metrics
    print("\n=== Confusion Matrix Analysis ===")

    def print_metrics_from_cm(cm, model_name):
        """Print TN/FP/FN/TP, specificity and sensitivity of a 2x2 confusion matrix."""
        # The unpacking below is only defined for the binary case
        if cm.shape != (2, 2):
            print(f"\n{model_name} Metrics: skipped (confusion matrix is {cm.shape[0]}x{cm.shape[1]}, not 2x2)")
            return
        tn, fp, fn, tp = cm.ravel()
        specificity = tn / (tn + fp)
        sensitivity = tp / (tp + fn)
        print(f"\n{model_name} Metrics:")
        print(f"True Negatives: {tn}")
        print(f"False Positives: {fp}")
        print(f"False Negatives: {fn}")
        print(f"True Positives: {tp}")
        print(f"Specificity (True Negative Rate): {specificity:.4f}")
        print(f"Sensitivity (True Positive Rate): {sensitivity:.4f}")

    print_metrics_from_cm(cm1, "Best Traditional Model")
    print_metrics_from_cm(cm2, "MCF Model")

# Plot confusion matrices.
# `y_pred` is the module-level prediction array; this call assumes the MCF
# cell was the last one to assign it, so it holds the MCF predictions.
class_names = ['Class 0', 'Class 1']  # Update these names based on your actual classes
plot_confusion_matrices(results, y_test, y_pred, class_names)

## Results

In [None]:
!pip install ucimlrepo

from ucimlrepo import fetch_ucirepo
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import SparsePCA
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

##


# Assume that fetch_ucirepo and necessary dataset fetching utilities are defined
# Fetch dataset (UCI repository id 732)
darwin = fetch_ucirepo(id=732)

# Data
# Drop the first feature (participant ID). Take an explicit copy so that the
# in-place label encoding below writes to our own frame instead of a slice of
# the fetched data (avoids pandas' SettingWithCopyWarning).
X = darwin.data.features.iloc[:, 1:].copy()
y = darwin.data.targets.iloc[:, 0]  # Assuming y is single-column

# Convert categorical (object-dtype) columns to integer codes
non_numeric_cols = X.select_dtypes(include=['object']).columns
for col in non_numeric_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Encode target. Either branch yields a NumPy array, so positional indexing of
# the labels can never be interpreted as a label-based lookup on a shuffled
# pandas index.
y = LabelEncoder().fit_transform(y) if y.dtype == 'object' else y.to_numpy()

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6)

# Data augmentation (disabled): oversample the training set with replacement
# until it holds `target_size` rows.
if False:
    target_size = 500
    current_size = X_train.shape[0]

    if current_size < target_size:
        samples_needed = target_size - current_size
        # Draw the row positions once and use them for both the features and
        # the labels, so every duplicated row keeps its own label.
        rng = np.random.default_rng(42)
        extra_positions = rng.integers(0, current_size, size=samples_needed)
        additional_X = X_train.iloc[extra_positions]
        additional_y = np.asarray(y_train)[extra_positions]

        # Concatenate the original and augmented data
        X_train = pd.concat([X_train, additional_X], ignore_index=True)
        y_train = np.concatenate([y_train, additional_y])

# Preprocessing step 1: drop constant columns.
# The variance is measured on the training split only; the resulting column
# selection is then applied unchanged to the test split.
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
variances = X_train.loc[:, numeric_cols].var()
low_variance_mask = variances.gt(0)  # True for columns to KEEP (variance above zero)
selected_numeric_cols = numeric_cols[low_variance_mask]

# Originally non-numeric columns that are still present in X_train
valid_non_numeric_cols = [name for name in non_numeric_cols if name in X_train.columns]

# Build the column selection once and apply it to both splits
kept_columns = selected_numeric_cols.union(valid_non_numeric_cols)
X_train_filtered, X_test_filtered = X_train[kept_columns], X_test[kept_columns]

print("X_train_filtered shape:", X_train_filtered.shape)
print("X_test_filtered shape:", X_test_filtered.shape)



# Remove highly correlated features.
# Work on the absolute correlation so that strongly negatively correlated pairs
# (r < -threshold), which are just as redundant, are dropped as well.
correlation_matrix = X_train_filtered.corr().abs()
correlation_threshold = 0.80
# Keep only the strict upper triangle: each pair is inspected once, and the
# column that is dropped is the later one of the pair.
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1))
correlated_features = [column for column in upper_triangle.columns if (upper_triangle[column] > correlation_threshold).any()]
# The columns to drop are chosen on the training split and removed from both splits
X_train_filtered = X_train_filtered.loc[:, ~X_train_filtered.columns.isin(correlated_features)]
X_test_filtered = X_test_filtered.loc[:, ~X_test_filtered.columns.isin(correlated_features)]

# Feature Selection: Use SelectKBest to select top features based on ANOVA F-value
from sklearn.feature_selection import SelectKBest, f_classif
# Ask for at most 20 features; clamp to what survived the earlier filters so
# SelectKBest does not raise when fewer than 20 columns remain.
n_selected = min(20, X_train_filtered.shape[1])
select_kbest = SelectKBest(score_func=f_classif, k=n_selected)
# The selector returns plain arrays, so the columns get generic names
selected_names = [f'feature_{i}' for i in range(n_selected)]
X_train_filtered = pd.DataFrame(select_kbest.fit_transform(X_train_filtered, y_train), columns=selected_names)
X_test_filtered = pd.DataFrame(select_kbest.transform(X_test_filtered), columns=selected_names)

# Standardize features: statistics are learned from the training split only
scaler = StandardScaler().fit(X_train_filtered)
X_train_scaled = scaler.transform(X_train_filtered)
X_test_scaled = scaler.transform(X_test_filtered)

# PCA view: keep as many components as needed to retain 80% of the variance
pca = PCA(n_components=0.80)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Concatenation fusion: scaled features side by side with their PCA projection
X_train_concat = np.hstack([X_train_scaled, X_train_pca])
X_test_concat = np.hstack([X_test_scaled, X_test_pca])

# Sparse representation fusion: 10 sparse principal components
sparse_pca = SparsePCA(n_components=10, random_state=42)
X_train_sparse = sparse_pca.fit_transform(X_train_scaled)
X_test_sparse = sparse_pca.transform(X_test_scaled)

# Weighted fusion: same layout as the concatenation, with each part scaled by its weight
weight_pca = 0.7
weight_original = 0.3
X_train_weighted = np.concatenate(
    [X_train_scaled * weight_original, X_train_pca * weight_pca], axis=1
)
X_test_weighted = np.concatenate(
    [X_test_scaled * weight_original, X_test_pca * weight_pca], axis=1
)

# Fused feature views, each stored as a (train matrix, test matrix) pair
fused_datasets = {
    "PCA": (X_train_pca, X_test_pca),
    "Concatenation": (X_train_concat, X_test_concat),
    "Sparse": (X_train_sparse, X_test_sparse),
    "Weighted": (X_train_weighted, X_test_weighted)
}


def _fresh_base_classifiers():
    """Return a new, unfitted instance of every base classifier, keyed by short name."""
    return {
        "Random Forest": RandomForestClassifier(random_state=42),
        "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "SVM": SVC(probability=True, random_state=42),
        "KNN": KNeighborsClassifier(),
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Naive Bayes": GaussianNB(),
        "Gradient Boost": GradientBoostingClassifier(random_state=42),
    }


# One independent estimator per (fusion, classifier) pair, keyed "<fusion> <classifier>"
classifiers = {}
for feature_name in fused_datasets:
    for short_name, estimator in _fresh_base_classifiers().items():
        classifiers[f"{feature_name} {short_name}"] = estimator

# Per-classifier weights used by the weighted-voting and probability-averaging ensembles
weights = {
    "Random Forest": 2,
    "Logistic Regression": 1.5,
    "XGBoost": 2,
    "Decision Tree": 1,
    "SVM": 1,
    "KNN": 1,
    "Naive Bayes": 1,
    "Gradient Boost": 1,
}

# Containers filled by the training loop
results, predictions, probabilities = {}, {}, {}

def _as_metric_record(scores):
    """Label an (accuracy, precision, recall, f1) tuple with its metric names."""
    return dict(zip(("accuracy", "precision", "recall", "f1"), scores))


# For every fused feature view: fit each base classifier, score it on the test
# split, then score three ensembles built from the base classifiers' outputs.
for fusion_name, (X_train_fused, X_test_fused) in fused_datasets.items():
    print(f"Number of features for {fusion_name} fusion: {X_train_fused.shape[1]}")

    for clf_name in weights:
        clf = classifiers[f"{fusion_name} {clf_name}"]
        clf.fit(X_train_fused, y_train)
        y_pred = clf.predict(X_test_fused)
        # `predictions` / `probabilities` are keyed by classifier name only, so
        # each fusion overwrites the entries written by the previous one.
        predictions[clf_name] = y_pred
        probabilities[clf_name] = clf.predict_proba(X_test_fused)[:, 1]

        # Weighted-average metrics for this single classifier
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
        results[f"{fusion_name} + {clf_name}"] = _as_metric_record(
            (accuracy, precision, recall, f1)
        )

        # Report every test sample this classifier got wrong
        misclassified_indices = np.flatnonzero(y_test != y_pred)
        if misclassified_indices.size:
            print(f"\nMisclassified samples for {fusion_name} + {clf_name}:")
            for idx in misclassified_indices:
                print(f"  Index: {idx}, True Label: {y_test[idx]}, Predicted Label: {y_pred[idx]}")

    pred_df = pd.DataFrame(predictions)

    # Weighted sum of the hard 0/1 votes
    weighted_votes = np.zeros(len(X_test_fused))
    for name, weight in weights.items():
        weighted_votes += pred_df[name] * weight

    # Majority vote: most common prediction per row (first mode when several tie)
    majority_vote_pred = pred_df.mode(axis=1)[0]

    # Weighted vote: positive when the weighted votes reach half of the total weight
    weighted_vote_pred = (weighted_votes >= (sum(weights.values()) / 2)).astype(int)

    # Probability voting: weight-averaged positive-class probability, cut at 0.5
    avg_prob = sum(weight * probabilities[name] for name, weight in weights.items()) / sum(weights.values())
    prob_avg_pred = (avg_prob >= 0.5).astype(int)

    ensemble_outputs = (
        ("Majority Voting", majority_vote_pred),
        ("Weighted Voting", weighted_vote_pred),
        ("Probability Voting", prob_avg_pred),
    )
    for ensemble_name, ensemble_pred in ensemble_outputs:
        accuracy, precision, recall, f1 = evaluate(y_test, ensemble_pred)
        results[f"{fusion_name} + {ensemble_name}"] = _as_metric_record(
            (accuracy, precision, recall, f1)
        )


# Define a small MLP that learns, for a given sample, WHICH base classifier to
# trust: its softmax output has one unit per entry in `weights`.
mlp_model = Sequential()
mlp_model.add(Dense(3, input_dim=X_train_weighted.shape[1], activation='relu'))
mlp_model.add(Dense(3, activation='relu'))
mlp_model.add(Dense(len(weights), activation='softmax'))
print(len(weights))

mlp_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Set up K-Fold cross-validation; the whole 9-fold pass is repeated 5 times and
# the same MLP keeps training across all folds and repetitions.
kf = KFold(n_splits=9, shuffle=True, random_state=6)
for steps in range(5):
    for fold, (train_index, test_index) in enumerate(kf.split(X_train_weighted)):
        print(f"\nProcessing Fold {fold + 1}")

        # Split the weighted-fusion training matrix for the current fold.
        # A dedicated name is used for the fold's training part so the global
        # `X_train` is not overwritten by a fold-sized array.
        X_fold_train, X_val = X_train_weighted[train_index], X_train_weighted[test_index]
        y_train_mlp, y_val = y_train[train_index], y_train[test_index]

        # Fresh base classifiers for this fold. The insertion order defines the
        # class index the MLP is trained to output.
        _classifiers ={
            "Random Forest": RandomForestClassifier(random_state=42),
            "Logistic Regression": LogisticRegression(max_iter=1000),
            "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
            "Decision Tree": DecisionTreeClassifier(random_state=42),
            "SVM": SVC(probability=True, random_state=42),
            "KNN": KNeighborsClassifier(),
            "Naive Bayes": GaussianNB(),
            "Gradient Boost": GradientBoostingClassifier(random_state=42),
        }

        # Fit every base classifier on the fold's training part
        for clf in _classifiers.values():
            clf.fit(X_fold_train, y_train_mlp)

        # Class probabilities of every classifier on the held-out part, each
        # scaled by that classifier's "Weighted + <name>" F1 score from `results`.
        # One batched predict_proba call per classifier replaces the former
        # per-sample calls. Stacked layout: (classifier, sample, class column).
        f1_scaled_proba = np.stack([
            clf.predict_proba(X_val) * results[f"Weighted + {name}"]["f1"]
            for name, clf in _classifiers.items()
        ])

        # For each sample look at the column of its true label (column 1 when
        # the label equals 1, column 0 otherwise) ...
        true_class_column = (np.asarray(y_val) == 1).astype(int)
        true_class_scores = f1_scaled_proba[:, np.arange(X_val.shape[0]), true_class_column]
        # ... and use the index of the classifier with the highest scaled
        # probability there as the MLP target (first index wins ties).
        y_for_mlp = np.argmax(true_class_scores, axis=0)
        print(y_for_mlp)
        mlp_model.fit(X_val, y_for_mlp, epochs=50, batch_size=10, verbose=1)

# Multi-classifier fusion (MCF): for every test sample the MLP picks the index
# of one base classifier, and that classifier's prediction becomes the output.
selected_classifier = np.argmax(mlp_model.predict(X_test_weighted), axis=1)
# One column of test predictions per base classifier. The width comes from
# `weights`, the same mapping the loop below iterates over.
_y_pred = np.zeros((y_test.shape[0], len(weights)))

for i, clf_name in enumerate(weights.keys()):
    clf = classifiers[f"Weighted {clf_name}"]
    _y_pred[:, i] = clf.predict(X_test_weighted)
# Row-wise pick: sample r takes the prediction stored in column selected_classifier[r]
y_pred = _y_pred[np.arange(_y_pred.shape[0]), selected_classifier]
print(selected_classifier)
print(selected_classifier.shape)
print(y_pred)
print(y_pred.shape)
# Calculate various evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)

# Store metrics in the results dictionary
MCF_results = {
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1": f1
}
print(MCF_results)
# Identify and print misclassified samples. The header names MCF explicitly:
# the loop variables still in scope belong to the last base classifier, not
# to this fused prediction.
misclassified_indices = np.where(y_test != y_pred)[0]
if len(misclassified_indices) > 0:
    print("\nMisclassified samples for MCF:")
    for idx in misclassified_indices:
        print(f"  Index: {idx}, True Label: {y_test[idx]}, Predicted Label: {y_pred[idx]}")



In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd

def get_all_predictions():
    """Collect test-set predictions for every model and ensemble.

    Reads the module-level `fused_datasets`, `classifiers`, `weights` and
    `y_predict_MCF`. NOTE(review): `y_predict_MCF` is not defined in this
    part of the notebook; it is indexed as one column per fusion, in
    `fused_datasets` order -- confirm where it is produced.

    Returns:
        dict mapping "<fusion> + <model>" to that model's test predictions.
        Per fusion the entries are the base classifiers, then
        "Weighted Voting", "Majority Voting" and "MCF".
    """
    collected = {}
    for column, (fusion_name, (_, X_test_fused)) in enumerate(fused_datasets.items()):
        prefix = f"{fusion_name} + "

        # Base classifiers registered under "<fusion> <classifier>"
        for full_name, fitted in classifiers.items():
            if not full_name.startswith(fusion_name):
                continue
            collected[prefix + full_name.split(' ', 1)[1]] = fitted.predict(X_test_fused)

        # Weighted voting: positive when the weighted votes reach half the total weight
        tally = np.zeros(len(X_test_fused))
        for short_name, vote_weight in weights.items():
            tally += collected[prefix + short_name] * vote_weight
        vote_threshold = sum(weights.values()) / 2
        collected[prefix + "Weighted Voting"] = (tally >= vote_threshold).astype(int)

        # Majority voting: first mode of each row of base predictions
        per_model = pd.DataFrame(
            {short_name: collected[prefix + short_name] for short_name in weights}
        )
        collected[prefix + "Majority Voting"] = per_model.mode(axis=1)[0]

        # MCF predictions for this fusion
        collected[prefix + "MCF"] = y_predict_MCF[:, column].squeeze()

    return collected


def create_comprehensive_metrics_table(predictions):
    """Build a per-model table of metrics derived from the confusion matrix.

    Each prediction vector is compared against the module-level `y_test`.
    The confusion matrix is unpacked as (tn, fp, fn, tp), so a 2x2 matrix
    is required.

    Args:
        predictions: mapping of model name to its predicted labels.

    Returns:
        DataFrame with one row per model, sorted by 'F2 Score' and then
        'F1 Score', both descending.
    """

    def _ratio_or_zero(numerator, denominator):
        # Any ratio with a non-positive denominator is reported as 0
        return numerator / denominator if denominator > 0 else 0

    rows = []
    for model_name, y_pred in predictions.items():
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        specificity = _ratio_or_zero(tn, tn + fp)
        sensitivity = _ratio_or_zero(tp, tp + fn)
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        precision = _ratio_or_zero(tp, tp + fp)
        recall = sensitivity  # recall and sensitivity are the same quantity

        has_signal = (precision + recall) > 0
        f1 = 2 * (precision * recall) / (precision + recall) if has_signal else 0
        # F-beta with beta = 2: recall counts more than precision
        beta_sq = 2**2
        f2 = (1 + beta_sq) * (precision * recall) / ((beta_sq * precision) + recall) if has_signal else 0

        rows.append({
            'Model': model_name,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall (Sensitivity)': recall,
            'F1 Score': f1,
            'F2 Score': f2,
            'Specificity': specificity,
            'True Negatives (TN)': tn,
            'False Positives (FP)': fp,
            'False Negatives (FN)': fn,
            'True Positives (TP)': tp
        })

    # Rank by F2 first, then F1, so recall-heavy models come first
    return pd.DataFrame(rows).sort_values(['F2 Score', 'F1 Score'], ascending=False)

def plot_all_confusion_matrices(predictions):
    """Draw one confusion-matrix heatmap per model on a four-column grid.

    Args:
        predictions: mapping of model name to predicted labels; each vector
            is compared against the module-level `y_test`.
    """
    n_cols = 4
    n_rows = -(-len(predictions) // n_cols)  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5*n_rows))
    axes = axes.flatten()

    # The grid always has at least as many cells as models, so zip pairs every model with an axis
    for ax, (model_name, y_pred) in zip(axes, predictions.items()):
        sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'Confusion Matrix\n{model_name}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')

    # Drop the grid cells left over in the last row
    for unused_ax in axes[len(predictions):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

def plot_performance_comparison(metrics_df):
    """Plot accuracy, precision, recall and F1 as grouped bars, one group per model.

    Args:
        metrics_df: table with a 'Model' column plus the metric columns, as
            returned by create_comprehensive_metrics_table (which names them
            'Accuracy', 'Precision', 'Recall (Sensitivity)' and 'F1 Score').
            A frame that uses the short names 'Recall' / 'F1' is accepted too.
    """
    # Legend label -> column name used by create_comprehensive_metrics_table.
    # Looking up 'Recall' / 'F1' directly raises KeyError on that table.
    label_to_column = {
        'Accuracy': 'Accuracy',
        'Precision': 'Precision',
        'Recall': 'Recall (Sensitivity)',
        'F1': 'F1 Score',
    }

    # Create figure
    plt.figure(figsize=(15, 8))

    # One bar per metric, shifted by `width` inside each model's group
    x = np.arange(len(metrics_df))
    width = 0.2

    for i, (label, column) in enumerate(label_to_column.items()):
        # Prefer a column that already carries the short label, if present
        source_column = label if label in metrics_df.columns else column
        plt.bar(x + i*width, metrics_df[source_column], width, label=label)

    plt.xlabel('Models')
    plt.ylabel('Score')
    plt.title('Performance Comparison Across All Models')
    # Centre each tick under its group of four bars
    plt.xticks(x + width*1.5, metrics_df['Model'], rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()
    plt.show()

# Collect the test-set predictions of every model and ensemble
all_predictions = get_all_predictions()

# Build the metrics table and print it in full (no row/column/width truncation)
metrics_df = create_comprehensive_metrics_table(all_predictions)
print("\n=== Comprehensive Metrics Table ===")
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, None)
print(metrics_df.to_string(index=False))

# One confusion-matrix heatmap per model
print("\n=== Confusion Matrices for All Models ===")
plot_all_confusion_matrices(all_predictions)

# Grouped bar chart of the headline metrics
print("\n=== Performance Comparison ===")
plot_performance_comparison(metrics_df)

# Persist the table next to the notebook (optional)
metrics_df.to_csv('model_metrics.csv', index=False)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd

def get_all_predictions():
    """Collect test-set predictions for every model and ensemble.

    Reads the module-level `fused_datasets`, `classifiers`, `weights` and
    `y_predict_MCF`. NOTE(review): `y_predict_MCF` is not defined in this
    part of the notebook; it is indexed as one column per fusion, in
    `fused_datasets` order -- confirm where it is produced.

    Returns:
        dict mapping "<fusion> + <model>" to that model's test predictions.
        Per fusion the entries are the base classifiers, then
        "Weighted Voting", "Majority Voting" and "MCF".
    """
    collected = {}
    for column, (fusion_name, (_, X_test_fused)) in enumerate(fused_datasets.items()):
        prefix = f"{fusion_name} + "

        # Base classifiers registered under "<fusion> <classifier>"
        for full_name, fitted in classifiers.items():
            if not full_name.startswith(fusion_name):
                continue
            collected[prefix + full_name.split(' ', 1)[1]] = fitted.predict(X_test_fused)

        # Weighted voting: positive when the weighted votes reach half the total weight
        tally = np.zeros(len(X_test_fused))
        for short_name, vote_weight in weights.items():
            tally += collected[prefix + short_name] * vote_weight
        vote_threshold = sum(weights.values()) / 2
        collected[prefix + "Weighted Voting"] = (tally >= vote_threshold).astype(int)

        # Majority voting: first mode of each row of base predictions
        per_model = pd.DataFrame(
            {short_name: collected[prefix + short_name] for short_name in weights}
        )
        collected[prefix + "Majority Voting"] = per_model.mode(axis=1)[0]

        # MCF predictions for this fusion
        collected[prefix + "MCF"] = y_predict_MCF[:, column].squeeze()

    return collected


def create_comprehensive_metrics_table(predictions):
    """Build a per-model table of metrics derived from the confusion matrix.

    Each prediction vector is compared against the module-level `y_test`.
    The confusion matrix is unpacked as (tn, fp, fn, tp), so a 2x2 matrix
    is required.

    Args:
        predictions: mapping of model name to its predicted labels.

    Returns:
        DataFrame with one row per model, sorted by 'F2 Score' and then
        'F1 Score', both descending.
    """

    def _ratio_or_zero(numerator, denominator):
        # Any ratio with a non-positive denominator is reported as 0
        return numerator / denominator if denominator > 0 else 0

    rows = []
    for model_name, y_pred in predictions.items():
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        specificity = _ratio_or_zero(tn, tn + fp)
        sensitivity = _ratio_or_zero(tp, tp + fn)
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        precision = _ratio_or_zero(tp, tp + fp)
        recall = sensitivity  # recall and sensitivity are the same quantity

        has_signal = (precision + recall) > 0
        f1 = 2 * (precision * recall) / (precision + recall) if has_signal else 0
        # F-beta with beta = 2: recall counts more than precision
        beta_sq = 2**2
        f2 = (1 + beta_sq) * (precision * recall) / ((beta_sq * precision) + recall) if has_signal else 0

        rows.append({
            'Model': model_name,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall (Sensitivity)': recall,
            'F1 Score': f1,
            'F2 Score': f2,
            'Specificity': specificity,
            'True Negatives (TN)': tn,
            'False Positives (FP)': fp,
            'False Negatives (FN)': fn,
            'True Positives (TP)': tp
        })

    # Rank by F2 first, then F1, so recall-heavy models come first
    return pd.DataFrame(rows).sort_values(['F2 Score', 'F1 Score'], ascending=False)

def plot_all_confusion_matrices(predictions):
    """Draw one confusion-matrix heatmap per model on a four-column grid.

    Args:
        predictions: mapping of model name to predicted labels; each vector
            is compared against the module-level `y_test`.
    """
    n_cols = 4
    n_rows = -(-len(predictions) // n_cols)  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5*n_rows))
    axes = axes.flatten()

    # The grid always has at least as many cells as models, so zip pairs every model with an axis
    for ax, (model_name, y_pred) in zip(axes, predictions.items()):
        sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'Confusion Matrix\n{model_name}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')

    # Drop the grid cells left over in the last row
    for unused_ax in axes[len(predictions):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

def plot_performance_comparison(metrics_df):
    """Plot accuracy, precision, recall and F1 as grouped bars, one group per model.

    Args:
        metrics_df: table with a 'Model' column plus the metric columns, as
            returned by create_comprehensive_metrics_table (which names them
            'Accuracy', 'Precision', 'Recall (Sensitivity)' and 'F1 Score').
            A frame that uses the short names 'Recall' / 'F1' is accepted too.
    """
    # Legend label -> column name used by create_comprehensive_metrics_table.
    # Looking up 'Recall' / 'F1' directly raises KeyError on that table.
    label_to_column = {
        'Accuracy': 'Accuracy',
        'Precision': 'Precision',
        'Recall': 'Recall (Sensitivity)',
        'F1': 'F1 Score',
    }

    # Create figure
    plt.figure(figsize=(15, 8))

    # One bar per metric, shifted by `width` inside each model's group
    x = np.arange(len(metrics_df))
    width = 0.2

    for i, (label, column) in enumerate(label_to_column.items()):
        # Prefer a column that already carries the short label, if present
        source_column = label if label in metrics_df.columns else column
        plt.bar(x + i*width, metrics_df[source_column], width, label=label)

    plt.xlabel('Models')
    plt.ylabel('Score')
    plt.title('Performance Comparison Across All Models')
    # Centre each tick under its group of four bars
    plt.xticks(x + width*1.5, metrics_df['Model'], rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()
    plt.show()

# Collect the test-set predictions of every model and ensemble
all_predictions = get_all_predictions()

# Build the metrics table and print it in full (no row/column/width truncation)
metrics_df = create_comprehensive_metrics_table(all_predictions)
print("\n=== Comprehensive Metrics Table ===")
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, None)
print(metrics_df.to_string(index=False))

# One confusion-matrix heatmap per model
print("\n=== Confusion Matrices for All Models ===")
plot_all_confusion_matrices(all_predictions)

# Grouped bar chart of the headline metrics
print("\n=== Performance Comparison ===")
plot_performance_comparison(metrics_df)

# Persist the table next to the notebook (optional)
metrics_df.to_csv('model_metrics.csv', index=False)