In [9]:
import argparse

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [26]:
# Step 1: Variance and Correlation Thresholding
def variance_and_correlation_thresholding(df, var_thresh=0.01, corr_thresh=0.85):
    """Drop low-variance columns, then prune highly correlated ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. It is handed to ``VarianceThreshold`` and ``.corr()``,
        so the columns are presumably all numeric -- confirm against caller.
    var_thresh : float, default 0.01
        Passed to ``VarianceThreshold(threshold=...)``.
    corr_thresh : float, default 0.85
        A column is dropped when its absolute correlation with any column
        positioned *before* it exceeds this value.

    Returns
    -------
    pandas.DataFrame
        A column subset of ``df``. The row index and the column dtypes of the
        input are preserved.
    """
    # Fit only to obtain the boolean support mask. Slicing the original frame
    # (instead of rebuilding one from the transformed numpy array) keeps the
    # caller's row index and dtypes, so the result stays aligned with any
    # target Series that shares that index.
    selector = VarianceThreshold(threshold=var_thresh).fit(df)
    df_var_filtered = df.loc[:, selector.get_support()]

    # Correlation Thresholding
    corr_matrix = df_var_filtered.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and a
    # column is never compared with itself.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper_tri = corr_matrix.where(upper_mask)
    to_drop = [
        column
        for column in upper_tri.columns
        if (upper_tri[column] > corr_thresh).any()
    ]

    return df_var_filtered.drop(columns=to_drop)

In [27]:
# Step 2: Apply Lasso for Feature Selection
def lasso_feature_selection(X, y, alpha=0.001):
    """Return the columns of ``X`` that receive a non-zero Lasso coefficient.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; ``X.columns`` is indexed with the coefficient mask.
    y : array-like
        Target passed straight to ``Lasso.fit``.
    alpha : float, default 0.001
        L1 regularisation strength forwarded to ``Lasso``.

    Returns
    -------
    pandas.DataFrame
        ``X`` restricted to the selected columns.
    """
    # NOTE(review): X is fitted as-is (no scaling inside this function), so the
    # penalty presumably acts unevenly on features with different scales --
    # confirm whether callers standardise beforehand.
    fitted = Lasso(alpha=alpha, max_iter=10000).fit(X, y)
    nonzero_mask = fitted.coef_ != 0
    kept_columns = X.columns[nonzero_mask]
    return X[kept_columns]

In [28]:
# Step 3: Apply LDA for Dimensionality Reduction and Classification
def lda_with_cross_validation(X, y, n_splits=5):
    """Cross-validate an LDA projection followed by logistic regression.

    For every stratified fold, LDA is fitted on the training rows, both the
    training and test rows are projected, and a logistic regression is trained
    and evaluated on the projected data.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Class labels; rows are selected positionally with ``.iloc``.
    n_splits : int, default 5
        Number of folds given to ``StratifiedKFold``.

    Returns
    -------
    dict
        Keys ``"precision"``, ``"recall"`` and ``"accuracy"``, each the mean of
        the per-fold score. Precision and recall are macro-averaged.
    """
    lda = LDA(n_components=1)
    classifier = LogisticRegression(max_iter=1000)

    skf = StratifiedKFold(n_splits=n_splits)
    precision_scores = []
    recall_scores = []
    accuracy_scores = []

    for train_index, test_index in skf.split(X, y):
        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

        # The projection is learned from the training rows only and then
        # applied to the held-out rows.
        X_train_lda = lda.fit_transform(X_train_fold, y_train_fold)
        X_test_lda = lda.transform(X_test_fold)

        classifier.fit(X_train_lda, y_train_fold)
        y_pred_fold = classifier.predict(X_test_lda)

        precision_scores.append(precision_score(y_test_fold, y_pred_fold, average='macro'))
        recall_scores.append(recall_score(y_test_fold, y_pred_fold, average='macro'))
        # Score the predictions already computed above instead of asking the
        # classifier to predict the same fold a second time.
        accuracy_scores.append(accuracy_score(y_test_fold, y_pred_fold))

    return {
        "precision": np.mean(precision_scores),
        "recall": np.mean(recall_scores),
        "accuracy": np.mean(accuracy_scores)
    }

In [29]:
# Step 4: Compare feature sets
def compare_feature_sets(selected_features_dict):
    """Print each dataset's selected features and the features common to all.

    Parameters
    ----------
    selected_features_dict : dict
        Maps a dataset name to an iterable of selected feature names.

    Returns
    -------
    set
        Feature names present in every dataset's selection; an empty set when
        ``selected_features_dict`` is empty.
    """
    print("\nComparing selected features across datasets:")
    for dataset, features in selected_features_dict.items():
        print(f"\nSelected features for {dataset}:")
        print(features)

    feature_sets = [set(features) for features in selected_features_dict.values()]
    # set.intersection() called with no arguments raises TypeError, so an empty
    # mapping is reported as "no common features" instead of crashing.
    common_features = set.intersection(*feature_sets) if feature_sets else set()
    print(f"\nCommon selected features across all datasets: {common_features}")

    return common_features


In [30]:
# Main function to combine LASSO + LDA and compare datasets
def main(datasets=None):
    """Run thresholding, Lasso selection and LDA cross-validation per dataset.

    Parameters
    ----------
    datasets : dict, optional
        Maps a dataset label to the CSV file to read. When omitted, the three
        default files ("synth_seg.csv", "resampled_data.csv",
        "undersampled_data.csv") are used. Each CSV must contain the columns
        'Subject', 'decision' and 'neuropsych_score'.

    Returns
    -------
    dict
        Maps each dataset label to the list of column names kept by the Lasso
        selection step.
    """
    if datasets is None:
        # Files are named explicitly without using paths
        datasets = {
            "original": "synth_seg.csv",
            "resampled": "resampled_data.csv",
            "undersampled": "undersampled_data.csv"
        }

    selected_features_dict = {}

    for name, file_path in datasets.items():
        # Load data
        data = pd.read_csv(file_path)
        # NOTE(review): astype(bool) assumes 'decision' is already boolean or
        # 0/1; non-empty strings would all become True -- confirm the CSV schema.
        y = data['decision'].astype(bool)
        # `columns=` already names the axis, so no separate `axis` argument.
        X = data.drop(columns=['Subject', 'decision', 'neuropsych_score'])

        # Step 1: Apply variance and correlation thresholding
        X_filtered = variance_and_correlation_thresholding(X)

        # Step 2: Apply Lasso for feature selection
        # NOTE(review): the Lasso is fitted on every row of the dataset before
        # the cross-validation below, so the held-out folds have already
        # influenced which features are kept.
        X_lasso_selected = lasso_feature_selection(X_filtered, y)

        # Store selected features
        selected_features_dict[name] = X_lasso_selected.columns.tolist()

        # Step 3: Apply LDA and report performance (optional)
        print(f"\nLDA results for {name} dataset:")
        lda_results = lda_with_cross_validation(X_lasso_selected, y)
        print(lda_results)

    # Step 4: Compare selected features across datasets
    compare_feature_sets(selected_features_dict)

    return selected_features_dict

if __name__ == "__main__":
    main()


LDA results for original dataset:
{'precision': 0.500110431619759, 'recall': 0.5018386740661176, 'accuracy': 0.668611488014473}

LDA results for resampled dataset:
{'precision': 0.6390247374770823, 'recall': 0.6369897959183675, 'accuracy': 0.6371134020618556}

LDA results for undersampled dataset:
{'precision': 0.542360509115927, 'recall': 0.5394736842105263, 'accuracy': 0.5384384384384384}

Comparing selected features across datasets:

Selected features for original:
['total intracranial', 'left lateral ventricle', 'left inferior lateral ventricle', 'left cerebellum white matter', 'left thalamus', 'left caudate', 'left putamen', 'left pallidum', '3rd ventricle', '4th ventricle', 'left hippocampus', 'left amygdala', 'csf', 'left accumbens area', 'right inferior lateral ventricle', 'right pallidum', 'right hippocampus', 'right amygdala', 'right accumbens area', 'ctx-lh-bankssts', 'ctx-lh-caudalmiddlefrontal', 'ctx-lh-cuneus', 'ctx-lh-entorhinal', 'ctx-lh-inferiorparietal', 'ctx-lh-infe