In [1]:
import os
import numpy as np
import pandas as pd
from nilearn.connectome import ConnectivityMeasure
from sklearn.preprocessing import StandardScaler

In [3]:
# ---------------------------------------------------------------------------
# Build the current-anxiety (CAD) feature matrix and label vector.
#
# Outputs of this cell used further down the notebook:
#   X_cad : (n_subjects, n_edges) array of upper-triangle correlation values
#   y_cad : 'hospital_current_anxiety' column, one entry per row of X_cad,
#           in the SAME subject order as X_cad
# ---------------------------------------------------------------------------
user_dir = '/Users/xiaoqianxiao'
projectName = 'UKB'
data_dir = os.path.join(user_dir, projectName, "data")
derivatives_dir = os.path.join(data_dir, 'derivatives')
fMRIinfo_file_path = os.path.join(data_dir, 'current_anxiety_data_set.csv')
df_fMRIinfo = pd.read_csv(fMRIinfo_file_path)
participant_file_path = os.path.join(data_dir, 'participants_fMRI.csv')
df_participants = pd.read_csv(participant_file_path)
subject_IDs = df_fMRIinfo['eid'].unique()
session_ID = 2
timeseries_dir = os.path.join(derivatives_dir, 'timeseries/current_anxiety_data_set')

# Load the time series of every subject that has both files on disk.
X = []  # one 2-D array per loaded subject
subject_id_list_ori = []  # subject IDs, parallel to X

for subject_ID_ori in subject_IDs:
    cortical_file_name = f"sub-{subject_ID_ori}_ses-{session_ID}_task-rest_space-Glasser.csv.gz"
    cortical_file_path = os.path.join(timeseries_dir, cortical_file_name)
    subcortical_file_name = f"sub-{subject_ID_ori}_ses-{session_ID}_task-rest_space-Tian_Subcortex_S2_3T.csv.gz"
    subcortical_file_path = os.path.join(timeseries_dir, subcortical_file_name)
    if os.path.exists(cortical_file_path) and os.path.exists(subcortical_file_path):
        df_cortical_all = pd.read_csv(cortical_file_path, compression='gzip', index_col=0, header=0)
        df_subcortical_all = pd.read_csv(subcortical_file_path, compression='gzip', index_col=0, header=0)
        # Both tables are transposed before being joined column-wise
        # (presumably the files are stored ROI x time -- TODO confirm).
        df_all = pd.concat([df_cortical_all.transpose(), df_subcortical_all.transpose()], axis=1)
        X.append(df_all.values)
        subject_id_list_ori.append(subject_ID_ori)
    else:
        print(f"Missing files for subject {subject_ID_ori}, session {session_ID}.")

# Step 1: Handle NaN values by filling them with the mean of each feature.
# The subject ID is carried along with every array so that skipping a subject
# here can never shift the ID <-> data pairing used in the later steps.
X_cleaned = []
subject_id_list_cleaned = []
for i, (subject_ID, data) in enumerate(zip(subject_id_list_ori, X)):
    if data.size == 0:
        print(f"Subject {i + 1} has an empty array. Skipping.")
        continue

    if np.all(np.isnan(data)):
        print(f"Subject {i + 1} has all NaN values. Filling with zeros.")
        X_cleaned.append(np.zeros_like(data))
        subject_id_list_cleaned.append(subject_ID)
        continue

    data_filled = np.copy(data)
    for j in range(data.shape[1]):
        column = data[:, j]
        column_is_nan = np.isnan(column)
        if not column_is_nan.any():
            continue
        if column_is_nan.all():
            print(f"Column {j} in Subject {i + 1} has all NaN values. Filling with zeros.")
            data_filled[:, j] = np.zeros(data.shape[0])
        else:
            feature_mean = np.nanmean(column)
            data_filled[:, j] = np.nan_to_num(column, nan=feature_mean)

    X_cleaned.append(data_filled)
    subject_id_list_cleaned.append(subject_ID)

# Step 2: Drop every subject that has at least one constant (zero-variance)
# column, so that all retained subjects keep the full, identical column set.
X_filtered = []
subject_id_list_filtered = []
for i, (subject_ID, data) in enumerate(zip(subject_id_list_cleaned, X_cleaned)):
    constant_columns = data.std(axis=0) == 0
    if constant_columns.any():
        print(f"Removed constant features for Subject {i + 1}.")
        continue
    X_filtered.append(data)
    subject_id_list_filtered.append(subject_ID)

# Step 3: Final check for any remaining NaNs
X_final = []
for i, data in enumerate(X_filtered):
    if np.isnan(data).any():
        print(f"Subject {i + 1} still has NaN values after filtering. Filling remaining NaNs with zeros.")
        X_final.append(np.nan_to_num(data, nan=0))
    else:
        X_final.append(data)

if not X_final:
    raise ValueError("No subjects left after cleaning; cannot compute connectivity matrices.")

# Ensure consistency in the number of features
if not all(data.shape[1] == X_final[0].shape[1] for data in X_final):
    raise ValueError("All subjects must have the same number of features before standardization.")

# Step 4: Standardize each subject's columns independently
cad_X_standardized = [StandardScaler().fit_transform(data) for data in X_final]

# Step 5: One correlation matrix per subject
correlation_measure = ConnectivityMeasure(kind='correlation')
connectivity_matrices = correlation_measure.fit_transform(cad_X_standardized)
num_subjects = connectivity_matrices.shape[0]
num_nodes = connectivity_matrices.shape[1]

# Step 6: Keep the strict upper triangle (k=1 excludes the diagonal) of every
# matrix, flattened to one row per subject.
upper_tri_indices = np.triu_indices(num_nodes, k=1)
cad_upper_triangle_flattened = connectivity_matrices[:, upper_tri_indices[0], upper_tri_indices[1]]

# Check the shape of the result
print(cad_upper_triangle_flattened.shape)

# NOTE: despite its name this is the number of columns (edges), not subjects.
sample_size = cad_upper_triangle_flattened.shape[1]

X_cad = cad_upper_triangle_flattened  # Feature matrix

# Step 7: Fetch the participant rows in exactly the order of
# subject_id_list_filtered. A plain `isin` filter would return rows in
# df_participants order and silently pair labels with the wrong subjects.
participants_by_eid = df_participants.set_index('eid', drop=False)
missing_eids = pd.Index(subject_id_list_filtered).difference(participants_by_eid.index)
if len(missing_eids) > 0:
    raise KeyError(f"Subjects without a row in participants_fMRI.csv: {list(missing_eids)}")
df_CAD = participants_by_eid.loc[subject_id_list_filtered].reset_index(drop=True)
if len(df_CAD) != X_cad.shape[0]:
    raise ValueError("participants_fMRI.csv contains duplicated 'eid' rows; labels cannot be aligned with X_cad.")

#hospital_current_anxiety is the label for classification
## after data clean, end up with 451 [454] CAD and 416[417] controls
y_cad = df_CAD['hospital_current_anxiety']  # Target variable, row-aligned with X_cad

Missing files for subject 4391134, session 2.
Column 18 in Subject 718 has all NaN values. Filling with zeros.
Column 56 in Subject 718 has all NaN values. Filling with zeros.
Column 57 in Subject 718 has all NaN values. Filling with zeros.
Column 79 in Subject 718 has all NaN values. Filling with zeros.
Column 91 in Subject 718 has all NaN values. Filling with zeros.
Column 101 in Subject 718 has all NaN values. Filling with zeros.
Column 112 in Subject 718 has all NaN values. Filling with zeros.
Column 154 in Subject 718 has all NaN values. Filling with zeros.
Removed constant features for Subject 17.
Removed constant features for Subject 59.
Removed constant features for Subject 79.
Removed constant features for Subject 139.
Removed constant features for Subject 153.
Removed constant features for Subject 167.
Removed constant features for Subject 170.
Removed constant features for Subject 244.
Removed constant features for Subject 246.
Removed constant features for Subject 258.
Remo

In [4]:
# ---------------------------------------------------------------------------
# Build the past-anxiety (PAD) feature matrix and label vector.
#
# Outputs of this cell used further down the notebook:
#   X_pad : (n_subjects, n_edges) array of upper-triangle correlation values
#   y_pad : 'hospital_not_now' column, one entry per row of X_pad,
#           in the SAME subject order as X_pad
# ---------------------------------------------------------------------------
user_dir = '/Users/xiaoqianxiao'
projectName = 'UKB'
data_dir = os.path.join(user_dir, projectName, "data")
derivatives_dir = os.path.join(data_dir, 'derivatives')
fMRIinfo_file_path = os.path.join(data_dir, 'past_anxiety_data_set.csv')
df_fMRIinfo = pd.read_csv(fMRIinfo_file_path)
participant_file_path = os.path.join(data_dir, 'participants_fMRI.csv')
df_participants = pd.read_csv(participant_file_path)
subject_IDs = df_fMRIinfo['eid'].unique()
session_ID = 2
timeseries_dir = os.path.join(derivatives_dir, 'timeseries/past_anxiety_data_set')

# Load the time series of every subject that has both files on disk.
X = []  # one 2-D array per loaded subject
subject_id_list_ori = []  # subject IDs, parallel to X

for subject_ID_ori in subject_IDs:
    cortical_file_name = f"sub-{subject_ID_ori}_ses-{session_ID}_task-rest_space-Glasser.csv.gz"
    cortical_file_path = os.path.join(timeseries_dir, cortical_file_name)
    subcortical_file_name = f"sub-{subject_ID_ori}_ses-{session_ID}_task-rest_space-Tian_Subcortex_S2_3T.csv.gz"
    subcortical_file_path = os.path.join(timeseries_dir, subcortical_file_name)
    if os.path.exists(cortical_file_path) and os.path.exists(subcortical_file_path):
        df_cortical_all = pd.read_csv(cortical_file_path, compression='gzip', index_col=0, header=0)
        df_subcortical_all = pd.read_csv(subcortical_file_path, compression='gzip', index_col=0, header=0)
        # Both tables are transposed before being joined column-wise
        # (presumably the files are stored ROI x time -- TODO confirm).
        df_all = pd.concat([df_cortical_all.transpose(), df_subcortical_all.transpose()], axis=1)
        X.append(df_all.values)
        subject_id_list_ori.append(subject_ID_ori)
    else:
        print(f"Missing files for subject {subject_ID_ori}, session {session_ID}.")

# Step 1: Handle NaN values by filling them with the mean of each feature.
# The subject ID is carried along with every array so that skipping a subject
# here can never shift the ID <-> data pairing used in the later steps.
X_cleaned = []
subject_id_list_cleaned = []
for i, (subject_ID, data) in enumerate(zip(subject_id_list_ori, X)):
    if data.size == 0:
        print(f"Subject {i + 1} has an empty array. Skipping.")
        continue

    if np.all(np.isnan(data)):
        print(f"Subject {i + 1} has all NaN values. Filling with zeros.")
        X_cleaned.append(np.zeros_like(data))
        subject_id_list_cleaned.append(subject_ID)
        continue

    data_filled = np.copy(data)
    for j in range(data.shape[1]):
        column = data[:, j]
        column_is_nan = np.isnan(column)
        if not column_is_nan.any():
            continue
        if column_is_nan.all():
            print(f"Column {j} in Subject {i + 1} has all NaN values. Filling with zeros.")
            data_filled[:, j] = np.zeros(data.shape[0])
        else:
            feature_mean = np.nanmean(column)
            data_filled[:, j] = np.nan_to_num(column, nan=feature_mean)

    X_cleaned.append(data_filled)
    subject_id_list_cleaned.append(subject_ID)

# Step 2: Drop every subject that has at least one constant (zero-variance)
# column, so that all retained subjects keep the full, identical column set.
X_filtered = []
subject_id_list_filtered = []
for i, (subject_ID, data) in enumerate(zip(subject_id_list_cleaned, X_cleaned)):
    constant_columns = data.std(axis=0) == 0
    if constant_columns.any():
        print(f"Removed constant features for Subject {i + 1}.")
        continue
    X_filtered.append(data)
    subject_id_list_filtered.append(subject_ID)

# Step 3: Final check for any remaining NaNs
X_final = []
for i, data in enumerate(X_filtered):
    if np.isnan(data).any():
        print(f"Subject {i + 1} still has NaN values after filtering. Filling remaining NaNs with zeros.")
        X_final.append(np.nan_to_num(data, nan=0))
    else:
        X_final.append(data)

if not X_final:
    raise ValueError("No subjects left after cleaning; cannot compute connectivity matrices.")

# Ensure consistency in the number of features
if not all(data.shape[1] == X_final[0].shape[1] for data in X_final):
    raise ValueError("All subjects must have the same number of features before standardization.")

# Step 4: Standardize each subject's columns independently
pad_X_standardized = [StandardScaler().fit_transform(data) for data in X_final]

# Step 5: One correlation matrix per subject
correlation_measure = ConnectivityMeasure(kind='correlation')
connectivity_matrices = correlation_measure.fit_transform(pad_X_standardized)
num_subjects = connectivity_matrices.shape[0]
num_nodes = connectivity_matrices.shape[1]

# Step 6: Keep the strict upper triangle (k=1 excludes the diagonal) of every
# matrix, flattened to one row per subject.
upper_tri_indices = np.triu_indices(num_nodes, k=1)
pad_upper_triangle_flattened = connectivity_matrices[:, upper_tri_indices[0], upper_tri_indices[1]]

# Check the shape of the result
print(pad_upper_triangle_flattened.shape)

# NOTE: despite its name this is the number of columns (edges), not subjects.
sample_size = pad_upper_triangle_flattened.shape[1]

X_pad = pad_upper_triangle_flattened  # Feature matrix

# Step 7: Fetch the participant rows in exactly the order of
# subject_id_list_filtered. A plain `isin` filter would return rows in
# df_participants order and silently pair labels with the wrong subjects.
participants_by_eid = df_participants.set_index('eid', drop=False)
missing_eids = pd.Index(subject_id_list_filtered).difference(participants_by_eid.index)
if len(missing_eids) > 0:
    raise KeyError(f"Subjects without a row in participants_fMRI.csv: {list(missing_eids)}")
df_PAD = participants_by_eid.loc[subject_id_list_filtered].reset_index(drop=True)
if len(df_PAD) != X_pad.shape[0]:
    raise ValueError("participants_fMRI.csv contains duplicated 'eid' rows; labels cannot be aligned with X_pad.")

y_pad = df_PAD['hospital_not_now']  # Target variable, row-aligned with X_pad
y = y_pad  # the original cell also bound the global name `y`; kept for compatibility

Missing files for subject 5971030, session 2.
Missing files for subject 3569403, session 2.
Missing files for subject 2587534, session 2.
Missing files for subject 2375956, session 2.
Column 237 in Subject 245 has all NaN values. Filling with zeros.
Column 11 in Subject 280 has all NaN values. Filling with zeros.
Column 79 in Subject 280 has all NaN values. Filling with zeros.
Column 94 in Subject 280 has all NaN values. Filling with zeros.
Column 113 in Subject 280 has all NaN values. Filling with zeros.
Column 120 in Subject 280 has all NaN values. Filling with zeros.
Column 123 in Subject 280 has all NaN values. Filling with zeros.
Column 165 in Subject 280 has all NaN values. Filling with zeros.
Column 237 in Subject 320 has all NaN values. Filling with zeros.
Column 237 in Subject 515 has all NaN values. Filling with zeros.
Removed constant features for Subject 13.
Removed constant features for Subject 14.
Removed constant features for Subject 75.
Removed constant features for Sub

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score

# Step 1: Split data into training (80%) and testing (20%)
def split_data(X, y, test_size=0.2, random_state=42):
    """Partition features and labels into a training and a test subset.

    Thin wrapper around ``train_test_split`` that fixes the notebook's
    defaults (20% test share, seed 42).

    Args:
        X: Feature matrix, one row per sample.
        y: Labels, one entry per row of ``X``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        The result of ``train_test_split``; callers in this notebook unpack
        it as ``X_train, X_test, y_train, y_test``.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(X, y, **split_options)

# Step 2: Feature selection using RFECV with SVM
def feature_selection_with_rfecv(X_train, y_train, step=1, cv=10):
    """Select features with cross-validated recursive feature elimination.

    A linear-kernel SVC is used as the ranking estimator and candidate
    feature counts are scored by accuracy.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        step: Forwarded to ``RFECV`` (features removed per iteration).
        cv: Forwarded to ``RFECV`` (cross-validation setting).

    Returns:
        Boolean mask (``RFECV.support_``) marking the selected columns.
    """
    selector = RFECV(
        estimator=SVC(kernel='linear'),  # linear kernel so the estimator exposes coefficients
        step=step,
        cv=cv,
        scoring='accuracy',
        n_jobs=-1,
    )
    selector.fit(X_train, y_train)
    selected_features = selector.support_
    print(f"Number of selected features: {sum(selected_features)}")
    return selected_features

# Step 3: Hyperparameter tuning with 10-fold CV
def tune_svm_hyperparameters(X_train, y_train):
    """Grid-search SVC hyperparameters with 10-fold cross-validation.

    The grid is given as one dictionary per kernel so that only parameters
    the kernel actually uses are varied: ``gamma`` is searched for 'rbf' and
    'poly' only, ``degree`` for 'poly' only. The previous single-dictionary
    grid crossed every parameter with every kernel, which fitted many
    candidates that differ only in a parameter the kernel ignores.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        tuple: ``(best_estimator, mean_cv_score)`` where ``best_estimator`` is
        ``GridSearchCV.best_estimator_`` and ``mean_cv_score`` is
        ``GridSearchCV.best_score_``.
    """
    c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]  # Regularization parameter
    gamma_values = ['scale', 'auto', 0.01, 0.1, 1]   # Kernel coefficient for rbf/poly
    tol_values = [1e-4, 1e-3, 1e-2]                  # Tolerance for stopping criteria
    degree_values = [2, 3, 4]                        # Degree for polynomial kernel

    param_grid = [
        {'kernel': ['linear'], 'C': c_values, 'tol': tol_values},
        {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values, 'tol': tol_values},
        {
            'kernel': ['poly'],
            'C': c_values,
            'gamma': gamma_values,
            'tol': tol_values,
            'degree': degree_values,
        },
    ]
    grid_search = GridSearchCV(
        SVC(),
        param_grid,
        scoring='accuracy',
        cv=10,
        verbose=1,
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    print(f"Best parameters: {grid_search.best_params_}")
    mean_cv_score = grid_search.best_score_
    print(f"Cross-Validation Accuracy (Training): {mean_cv_score:.4f}")
    return grid_search.best_estimator_, mean_cv_score

# Step 4: Train Final Model on Entire Training Data and Report Accuracy
def train_and_evaluate_final_model(X_train_selected, y_train, X_test_selected, y_test, best_params):
    """Fit an SVC with the given parameters and score it on the test split.

    Args:
        X_train_selected (ndarray): Training features, already reduced to the
            selected columns.
        y_train (ndarray): Training labels.
        X_test_selected (ndarray): Test features, reduced to the same columns.
        y_test (ndarray): Test labels.
        best_params (dict): Keyword arguments passed verbatim to ``SVC``.

    Returns:
        tuple: ``(final_model, test_accuracy)`` -- the fitted SVC and the
        value returned by its ``score`` method on the test split.
    """
    # Build a fresh estimator from the parameter dict and fit it on all training rows.
    final_model = SVC(**best_params)
    final_model.fit(X_train_selected, y_train)

    # Score once on the held-out rows and report it.
    test_accuracy = final_model.score(X_test_selected, y_test)
    print(f"Test Set Accuracy (Final Model): {test_accuracy:.4f}")
    return final_model, test_accuracy

# Step 5: Evaluate the model on the test set
def evaluate_model(model, X_test, y_test):
    """Predict on the test split, print the accuracy and return it.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.

    Returns:
        The value returned by ``accuracy_score(y_test, predictions)``.
    """
    accuracy = accuracy_score(y_test, model.predict(X_test))
    print(f"Test set accuracy: {accuracy}")
    return accuracy

# Main pipeline
def pipeline(X, y):
    """Run split -> RFECV feature selection -> SVC grid search -> final fit.

    Args:
        X: Feature matrix; it is indexed as ``X[:, mask]`` after splitting,
            so it must support NumPy-style column masking.
        y: Labels, one per row of ``X``.

    Returns:
        tuple: ``(final_model, selected_features, test_accuracy)``.
    """
    # 1. Hold out the test partition first; everything below is fitted on the
    #    training partition only.
    X_train, X_test, y_train, y_test = split_data(X, y)

    # 2. Learn the feature mask on the training rows and apply it to both partitions.
    selected_features = feature_selection_with_rfecv(X_train, y_train)
    X_train_selected, X_test_selected = (
        part[:, selected_features] for part in (X_train, X_test)
    )

    # 3. Hyperparameter search on the reduced training data.
    best_model, mean_cv_score = tune_svm_hyperparameters(X_train_selected, y_train)

    # 4. Re-fit with the winning parameters and score on the test partition.
    tuned_params = best_model.get_params()
    final_model, test_accuracy = train_and_evaluate_final_model(
        X_train_selected, y_train, X_test_selected, y_test, tuned_params
    )

    return final_model, selected_features, test_accuracy

# Apply the pipeline for both datasets
# Run the full pipeline once per dataset; each call returns
# (final model, selected-feature mask, test accuracy).
print("Pipeline for CAD dataset:")
(
    final_model_cad,
    selected_features_cad,
    test_accuracy_cad,
) = pipeline(X_cad, y_cad)

print("\nPipeline for PAD dataset:")
(
    final_model_pad,
    selected_features_pad,
    test_accuracy_pad,
) = pipeline(X_pad, y_pad)


Pipeline for CAD dataset:


KeyboardInterrupt: 

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet, Perceptron
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Split data into training and testing sets
def split_data(X, y, test_size=0.2, random_state=42):
    """Split ``X`` and ``y`` into train and test parts via ``train_test_split``.

    Args:
        X: Feature matrix, one row per sample.
        y: Labels aligned with the rows of ``X``.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Forwarded to ``train_test_split`` (default 42).

    Returns:
        The result of ``train_test_split``; ``pipeline`` unpacks it as
        ``X_train, X_test, y_train, y_test``.
    """
    split_kwargs = dict(test_size=test_size, random_state=random_state)
    return train_test_split(X, y, **split_kwargs)

# Step 2: Feature selection using RFE with a fixed number (n) of selected features (no cross-validation is performed here)
def feature_selection_with_rfe_cv(X_train, y_train, n, step=1):
    """
    Perform feature selection using Recursive Feature Elimination (RFE).

    Despite the ``_cv`` suffix in the name, no cross-validation is run here:
    a single RFE pass reduces the feature set to exactly ``n`` features.

    Parameters:
    - X_train: Training feature set
    - y_train: Training labels
    - n: Number of features to select
    - step: Forwarded to ``RFE`` (features removed per iteration). Defaults
      to 1, the value that was previously hard-coded; exposed so callers can
      raise it, mirroring the ``step`` argument of
      ``feature_selection_with_rfecv``.

    Returns:
    - selected_features: A boolean mask indicating selected features
    """
    svm = SVC(kernel='linear')  # Use linear SVM as the base model for RFE
    rfe = RFE(estimator=svm, n_features_to_select=n, step=step)
    rfe.fit(X_train, y_train)
    selected_features = rfe.support_

    # Defensive fallback: never hand an all-False mask to the caller.
    if np.sum(selected_features) == 0:
        print("No features selected. Using all features as fallback.")
        selected_features = np.ones(X_train.shape[1], dtype=bool)

    return selected_features

# Step 3: Model selection using cross-validation
def model_selection(X_train, y_train, random_state=42):
    """Pick the candidate classifier with the highest mean 10-fold CV accuracy.

    Every estimator that accepts a ``random_state`` is seeded, so repeated
    runs score the candidates identically and select the same winner.
    Previously the stochastic candidates were unseeded and the selected model
    could change from run to run.

    Parameters:
    - X_train: Training feature set
    - y_train: Training labels
    - random_state: Seed passed to the estimators that accept one
      (default 42, matching ``split_data``)

    Returns:
    - best_model: The unfitted estimator instance with the best mean CV
      accuracy, or None if no candidate produced a score greater than -inf
      (e.g. all scores were NaN)
    """
    models = {
        "Logistic Regression": LogisticRegression(random_state=random_state),
        # NOTE: this entry is an L2-penalised logistic regression fitted with
        # liblinear, not sklearn's RidgeClassifier; the label is kept as-is.
        "Ridge Classifier": LogisticRegression(
            penalty='l2', solver='liblinear', random_state=random_state
        ),
        "Lasso (L1)": LogisticRegression(
            penalty='l1', solver='liblinear', random_state=random_state
        ),
        "ElasticNet (L1+L2)": LogisticRegression(
            penalty='elasticnet', solver='saga', l1_ratio=0.5, random_state=random_state
        ),
        "LDA": LinearDiscriminantAnalysis(),
        "Perceptron": Perceptron(random_state=random_state),
        "SVM (Linear)": SVC(kernel='linear'),
        "Random Forest": RandomForestClassifier(random_state=random_state)
    }

    best_model = None
    best_score = -np.inf
    best_name = ""

    for model_name, model in models.items():
        cv_score = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy').mean()
        print(f"Model: {model_name}, CV Score: {cv_score:.4f}")

        # Strict '>' keeps the earliest candidate on ties.
        if cv_score > best_score:
            best_score = cv_score
            best_model = model
            best_name = model_name

    print(f"Best Model: {best_name} with CV score: {best_score:.4f}")
    return best_model

# Step 4: Two-step grid search for hyperparameter optimization
def tune_model_hyperparameters(best_model, X_train, y_train):
    """Tune the selected model; only SVC gets an actual search.

    For an ``SVC`` a two-stage grid search over ``C`` is run with a linear
    kernel: a broad decade grid, then a refined grid spanning one decade
    either side of the broad winner. The refined grid is log-spaced and
    contains the broad winner itself, so the refined search cannot end up
    with a lower CV score than the broad one. (The earlier linearly spaced
    grid skipped the winning ``C`` and could.) ``gamma`` is no longer
    searched because the kernel is fixed to 'linear' in both grids.

    Any other estimator is simply fitted on the training data.

    Parameters:
    - best_model: Estimator returned by ``model_selection``
    - X_train: Training feature set
    - y_train: Training labels

    Returns:
    - The refined search's ``best_estimator_`` for SVC, otherwise
      ``best_model`` after fitting
    """
    if isinstance(best_model, SVC):
        # Stage 1: broad search over decades of C.
        param_grid = {
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'kernel': ['linear'],
        }
        grid_search = GridSearchCV(best_model, param_grid, scoring='accuracy', cv=10, verbose=1, n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Stage 2: multipliers 10**-1, 10**-0.5, 1, 10**0.5, 10 around the winner;
        # the middle multiplier is exactly 1, so the winning C is re-evaluated.
        best_c = grid_search.best_params_['C']
        refined_grid = {
            'C': best_c * np.logspace(-1, 1, 5),
            'kernel': ['linear'],
        }
        refined_search = GridSearchCV(best_model, refined_grid, scoring='accuracy', cv=10, verbose=1, n_jobs=-1)
        refined_search.fit(X_train, y_train)

        print(f"Best SVM Parameters (Refined): {refined_search.best_params_}")
        return refined_search.best_estimator_
    else:
        best_model.fit(X_train, y_train)
        return best_model

# Step 5: Train final model and evaluate with test data
def train_and_evaluate_final_model(X_train, y_train, X_test, y_test, model):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters:
    - X_train, y_train: Training features and labels passed to ``model.fit``
    - X_test, y_test: Test features and labels passed to ``model.score``
    - model: Estimator exposing ``fit`` and ``score``; it is fitted in place

    Returns:
    - (model, test_accuracy): the same estimator object and the value
      returned by ``model.score``
    """
    model.fit(X_train, y_train)  # in-place fit; the same object is returned below

    test_accuracy = model.score(X_test, y_test)
    print(f"Test Set Accuracy (Final Model): {test_accuracy:.4f}")

    return (model, test_accuracy)

def ensure_binary_target(y):
    """
    Validate that the target is binary and return it as 0/1 labels.

    Parameters:
    - y: Target variable (numpy array)

    Returns:
    - ``y.astype(int)`` when the unique values are boolean, otherwise ``y``
      itself when its unique values are exactly 0 and 1

    Raises:
    - ValueError: if there are more than two distinct values, or if the
      distinct values are not exactly {0, 1} (a single-class target is
      rejected too)
    """
    classes = np.unique(y)

    if len(classes) > 2:
        raise ValueError("Target variable contains more than two classes. Please preprocess the data.")

    if classes.dtype == bool:
        # Boolean labels become integers.
        return y.astype(int)

    # np.unique returns sorted values, so [0, 1] is the only ordering that can match.
    if not np.array_equal(classes, [0, 1]):
        raise ValueError("Target variable is not binary. Please preprocess the data.")

    return y
    
# Step 6: Calculate cosine similarity between two sets of model weights
def calculate_cosine_similarity(model1, model2):
    """
    Cosine similarity between the flattened ``coef_`` arrays of two models.

    The two coefficient vectors are compared position by position; this
    function does not check that position k refers to the same input feature
    in both models.

    Parameters:
    - model1: First trained model (must expose ``coef_``)
    - model2: Second trained model (must expose ``coef_``)

    Returns:
    - similarity: Cosine similarity score between the two model weights

    Raises:
    - ValueError: if either model has no ``coef_`` attribute
    """
    both_have_weights = hasattr(model1, 'coef_') and hasattr(model2, 'coef_')
    if not both_have_weights:
        raise ValueError("Models do not have coefficients. Cosine similarity cannot be computed.")

    weights1 = model1.coef_.flatten()
    weights2 = model2.coef_.flatten()
    # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
    similarity_matrix = cosine_similarity([weights1], [weights2])
    return similarity_matrix[0][0]

# Main pipeline with integration for CAD and PAD comparison
def pipeline(X, y, n_features):
    """Train and evaluate one classifier on a single dataset.

    Stages: validate the target, split, RFE feature selection, model
    selection by CV accuracy, hyperparameter tuning, final fit and test score.

    Parameters:
    - X: Feature matrix; indexed as ``X[:, mask]`` after splitting
    - y: Target variable, validated by ``ensure_binary_target``
    - n_features: Number of features RFE should keep

    Returns:
    - (final_model, selected_features, test_accuracy)

    Raises:
    - ValueError: if the target is not binary or no model could be selected
    """
    # Validate / normalise the labels before anything else.
    y = ensure_binary_target(y)

    # Hold out the test partition.
    X_train, X_test, y_train, y_test = split_data(X, y)
    print(f"After split_data, unique values in y_train: {np.unique(y_train)}")
    print(f"After split_data, unique values in y_test: {np.unique(y_test)}")

    # Learn the feature mask on the training rows, then apply it to both partitions.
    selected_features = feature_selection_with_rfe_cv(X_train, y_train, n_features)
    X_train_selected, X_test_selected = (
        part[:, selected_features] for part in (X_train, X_test)
    )
    print(f"After feature selection, unique values in y_train: {np.unique(y_train)}")

    # Choose the model family by cross-validated accuracy.
    best_model = model_selection(X_train_selected, y_train)
    if best_model is None:
        raise ValueError("No valid model was selected. Check your model selection process.")

    # Tune the chosen model, then fit it and score it on the held-out rows.
    tuned_model = tune_model_hyperparameters(best_model, X_train_selected, y_train)
    final_model, test_accuracy = train_and_evaluate_final_model(
        X_train_selected, y_train, X_test_selected, y_test, tuned_model
    )

    return final_model, selected_features, test_accuracy

# Step 7: Comparison between CAD and PAD models
def compare_models_and_analyze_topography(X_cad, y_cad, X_pad, y_pad, n_features):
    """Train CAD and PAD models, cross-apply the CAD model, compare weights.

    Steps:
    1. Run ``pipeline`` on the CAD data and on the PAD data.
    2. Apply the CAD model (with the CAD feature mask) to the whole PAD
       dataset and report accuracy, confusion matrix and classification
       report.
    3. Compute the cosine similarity between the two models' weights.

    The two pipelines select their features independently, so element k of
    one ``coef_`` vector generally belongs to a different input column than
    element k of the other. Each weight vector is therefore placed into a
    zero vector over the full feature space (unselected columns get weight 0)
    before the similarity is computed, so that equal positions mean equal
    features.

    Parameters:
    - X_cad, y_cad: CAD feature matrix and labels
    - X_pad, y_pad: PAD feature matrix and labels
    - n_features: Number of features each pipeline selects

    Returns:
    - (accuracy, similarity): accuracy of the CAD model on the PAD data and
      the cosine similarity of the two full-space weight vectors

    Raises:
    - ValueError: if either final model has no ``coef_``, or if the two
      datasets do not have the same number of columns
    """
    # Train the CAD model and save selected features
    print("Training CAD model...")
    final_model_cad, selected_features_cad, _ = pipeline(X_cad, y_cad, n_features)

    # Train the PAD model and save selected features
    print("\nTraining PAD model...")
    final_model_pad, selected_features_pad, _ = pipeline(X_pad, y_pad, n_features)

    # Cross-dataset evaluation: the CAD model only understands the CAD mask.
    print("\nEvaluating CAD model on PAD dataset:")
    X_pad_selected = X_pad[:, selected_features_cad]
    # Same label normalisation that pipeline() applied before training.
    y_pad_binary = ensure_binary_target(y_pad)
    y_pred_pad = final_model_cad.predict(X_pad_selected)
    accuracy = accuracy_score(y_pad_binary, y_pred_pad)
    print(f"Accuracy of CAD model on PAD data: {accuracy:.4f}")

    # Confusion matrix and classification report
    print("Confusion Matrix:\n", confusion_matrix(y_pad_binary, y_pred_pad))
    print("Classification Report:\n", classification_report(y_pad_binary, y_pred_pad))

    # Step 8: Calculate Cosine Similarity between CAD and PAD model weights
    print("\nCalculating cosine similarity between CAD and PAD model weights:")
    if not (hasattr(final_model_cad, 'coef_') and hasattr(final_model_pad, 'coef_')):
        raise ValueError("Models do not have coefficients. Cosine similarity cannot be computed.")
    if X_cad.shape[1] != X_pad.shape[1]:
        raise ValueError("CAD and PAD feature matrices must have the same number of columns to compare weights.")

    # Embed each model's weights in the shared full feature space.
    weights_cad = np.zeros(X_cad.shape[1])
    weights_cad[selected_features_cad] = np.ravel(final_model_cad.coef_)
    weights_pad = np.zeros(X_pad.shape[1])
    weights_pad[selected_features_pad] = np.ravel(final_model_pad.coef_)
    similarity = cosine_similarity([weights_cad], [weights_pad])[0][0]

    return accuracy, similarity

# Example usage with CAD and PAD datasets
# Number of features each pipeline keeps after RFE.
n_features = 20

# Stand-alone run per dataset: (final model, feature mask, test accuracy).
print("Pipeline for CAD dataset:")
(
    final_model_cad,
    selected_features_cad,
    test_accuracy_cad,
) = pipeline(X_cad, y_cad, n_features)

print("\nPipeline for PAD dataset:")
(
    final_model_pad,
    selected_features_pad,
    test_accuracy_pad,
) = pipeline(X_pad, y_pad, n_features)

# NOTE: the comparison helper calls pipeline() again for both datasets
# internally, so both models are trained a second time here.
print("\nComparing CAD and PAD models:")
accuracy, similarity = compare_models_and_analyze_topography(
    X_cad,
    y_cad,
    X_pad,
    y_pad,
    n_features,
)

print(f"\nCAD model accuracy on PAD data: {accuracy:.4f}")
print(f"Cosine similarity between CAD and PAD model weights: {similarity:.4f}")