In [27]:
#pip install deap

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from deap import base, creator, tools, algorithms
import random

In [29]:
# Genetic + Logistic Regression 
# Load Original dataset
data = pd.read_csv("synth_seg.csv")
y = data['decision'].astype(bool)
X = data.drop(columns=['Subject', 'decision', 'neuropsych_score'], axis=1)

# Number of features
n_features = X.shape[1]

# Check for existing creator classes to avoid redefinition warnings [Use this part only when there is multiple class creation error]
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # Maximize the accuracy score
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)  # Each individual represents a feature subset

# Function to evaluate a feature subset
def evaluate(individual):
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    
    if len(selected_features) == 0:
        return 0,  # Avoid selecting no features

    # Select features
    X_selected = X.iloc[:, selected_features]
    
    # Scale the features for better optimization convergence
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)
    
    # Logistic Regression
    classifier = LogisticRegression(max_iter=5000)  # Increased max_iter to 5000
    
    # Cross-validation
    skf = StratifiedKFold(n_splits=5)
    accuracy = cross_val_score(classifier, X_scaled, y, cv=skf, scoring='accuracy').mean()
    
    return accuracy,

# Generate an individual (random feature subset)
def create_individual():
    return [random.randint(0, 1) for _ in range(n_features)]

# Define crossover, mutation, and selection
toolbox = base.Toolbox()
toolbox.register("individual", tools.initIterate, creator.Individual, create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Register the genetic operators
toolbox.register("mate", tools.cxTwoPoint)  # Crossover
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)  # Mutation with 5% probability
toolbox.register("select", tools.selTournament, tournsize=3)  # Selection via tournament
toolbox.register("evaluate", evaluate)

# Run the Genetic Algorithm with the main function
def main():
    random.seed(42)
    pop = toolbox.population(n=50)  # Create a population of 50 individuals
    hof = tools.HallOfFame(1)  # Store the best individual

    # Run the GA
    algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, halloffame=hof, verbose=True)

    return hof[0]  # Return the best feature subset

best_individual = main()

# Print the best feature subset found
selected_features_lr = [i for i, bit in enumerate(best_individual) if bit == 1]
print(f"Selected Features from Original dataset using Logistic: {X.columns[selected_features_lr].tolist()}")
print(f"Number of features selected: {len(selected_features_lr)}")


gen	nevals
0  	50    
1  	25    
2  	19    
3  	27    
4  	31    
5  	34    
6  	32    
7  	37    
8  	31    
9  	31    
10 	32    
Selected Features from Original dataset using Logistic: ['general csf', 'brainstem', 'thalamus', 'putamen+pallidum', 'total intracranial', 'left cerebral white matter', 'left cerebral cortex', 'left cerebellum cortex', 'left putamen', 'left pallidum', '4th ventricle', 'left hippocampus', 'csf', 'right cerebral white matter', 'right cerebral cortex', 'right lateral ventricle', 'right inferior lateral ventricle', 'right caudate', 'right ventral DC', 'ctx-lh-bankssts', 'ctx-lh-caudalanteriorcingulate', 'ctx-lh-caudalmiddlefrontal', 'ctx-lh-entorhinal', 'ctx-lh-fusiform', 'ctx-lh-isthmuscingulate', 'ctx-lh-lateralorbitofrontal', 'ctx-lh-medialorbitofrontal', 'ctx-lh-paracentral', 'ctx-lh-parstriangularis', 'ctx-lh-pericalcarine', 'ctx-lh-postcentral', 'ctx-lh-posteriorcingulate', 'ctx-lh-precentral', 'ctx-lh-precuneus', 'ctx-lh-rostralanteriorcingulate', 'ctx-

Let's use LDA after logistic (Integrating both Logistic and LDA)

In [30]:
# Genetic + Logistic Regression +LDA
# Load Original dataset
data = pd.read_csv("synth_seg.csv")
y = data['decision'].astype(bool)
X = data.drop(columns=['Subject', 'decision', 'neuropsych_score'], axis=1)

# Number of features
n_features = X.shape[1]

# Check for existing creator classes to avoid redefinition warnings
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # Maximize the accuracy score
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)  # Each individual represents a feature subset

# Function to evaluate a feature subset
def evaluate(individual):
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    
    if len(selected_features) == 0:
        return 0,  # Avoid selecting no features

    # Select features
    X_selected = X.iloc[:, selected_features]
    
    # Scale the features for better optimization convergence
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)
    
    # Logistic Regression
    classifier = LogisticRegression(max_iter=5000)  # Increased max_iter to 5000
    
    # Cross-validation for Logistic Regression
    skf = StratifiedKFold(n_splits=5)
    accuracy_lr = cross_val_score(classifier, X_scaled, y, cv=skf, scoring='accuracy').mean()

    # Integrating LDA with logistic to the selected features
    lda = LDA(n_components=1)
    accuracy_lda = cross_val_score(lda, X_scaled, y, cv=skf, scoring='accuracy').mean()

    # Combine both Logistic Regression and LDA accuracy as fitness
    return (accuracy_lr + accuracy_lda) / 2,  # Average the scores from LR and LDA

# Generate an individual (random feature subset)
def create_individual():
    return [random.randint(0, 1) for _ in range(n_features)]

# Define crossover, mutation, and selection
toolbox = base.Toolbox()
toolbox.register("individual", tools.initIterate, creator.Individual, create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Register the genetic operators
toolbox.register("mate", tools.cxTwoPoint)  # Crossover
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)  # Mutation with 5% probability
toolbox.register("select", tools.selTournament, tournsize=3)  # Selection via tournament
toolbox.register("evaluate", evaluate)

# Run the Genetic Algorithm with the main function
def main():
    random.seed(42)
    pop = toolbox.population(n=50)  # Create a population of 50 individuals
    hof = tools.HallOfFame(1)  # Store the best individual

    # Run the GA
    algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, halloffame=hof, verbose=True)

    return hof[0]  # Return the best feature subset

best_individual = main()

# Print the best feature subset found
selected_features_lr_lda = [i for i, bit in enumerate(best_individual) if bit == 1]
print(f"Selected Features from Original Dataset using Logistic + LDA: {X.columns[selected_features_lr_lda].tolist()}")
print(f"Number of features selected: {len(selected_features_lr_lda)}")

gen	nevals
0  	50    
1  	25    
2  	19    
3  	27    
4  	31    
5  	34    
6  	32    
7  	37    
8  	31    
9  	31    
10 	32    
Selected Features from Original Dataset using Logistic + LDA: ['general white matter', 'general grey matter', 'cerebellum', 'brainstem', 'thalamus', 'hippocampus+amygdala', 'left lateral ventricle', 'left inferior lateral ventricle', 'left cerebellum cortex', 'left thalamus', 'left pallidum', 'left hippocampus', 'left amygdala', 'left ventral DC', 'right inferior lateral ventricle', 'right cerebellum white matter', 'right cerebellum cortex', 'right thalamus', 'right pallidum', 'right hippocampus', 'right amygdala', 'right accumbens area', 'right ventral DC', 'ctx-lh-bankssts', 'ctx-lh-fusiform', 'ctx-lh-inferiorparietal', 'ctx-lh-isthmuscingulate', 'ctx-lh-paracentral', 'ctx-lh-parsorbitalis', 'ctx-lh-posteriorcingulate', 'ctx-lh-precuneus', 'ctx-lh-superiorfrontal', 'ctx-lh-frontalpole', 'ctx-lh-transversetemporal', 'ctx-lh-insula', 'ctx-rh-bankssts', 'ct

In [31]:
# Converting both lists to sets for comparison
set_lr_only = set(selected_features_lr)
set_lr_lda = set(selected_features_lr_lda)

# Features common in both methods
common_features = set_lr_only.intersection(set_lr_lda)

# Features unique to GA + Logistic
unique_lr_only = set_lr_only.difference(set_lr_lda)

# Features unique to GA + Logistic + LDA
unique_lr_lda = set_lr_lda.difference(set_lr_only)

# Print the comparison results
print(f"Common Features: {common_features}")
print(f"Unique to GA + Logistic: {unique_lr_only}")
print(f"Unique to GA + Logistic + LDA: {unique_lr_lda}")

Common Features: {4, 5, 14, 18, 22, 30, 40, 41, 46, 49, 56, 62, 64, 73, 74, 78, 81, 84, 85, 89, 91, 92, 98, 108}
Unique to GA + Logistic: {2, 6, 8, 9, 10, 17, 20, 24, 27, 28, 29, 34, 42, 43, 45, 51, 53, 59, 60, 61, 63, 65, 68, 69, 77, 93, 96, 100, 103, 105}
Unique to GA + Logistic + LDA: {0, 1, 3, 7, 11, 12, 15, 23, 26, 31, 32, 33, 36, 37, 38, 39, 47, 58, 67, 71, 75, 86, 90, 99, 102, 107}


The above results was for our original dataset now let's try the same for Oversampled and undersampled data.

In [32]:
# Genetic + Logistic Regression
# Load OverSampled dataset
data = pd.read_csv("resampled_data.csv")
y = data['decision'].astype(bool)
X = data.drop(columns=['Subject', 'decision', 'neuropsych_score'], axis=1)

# Number of features
n_features = X.shape[1]

# Check for existing creator classes to avoid redefinition warnings [Use this part only when there is multiple class creation error]
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # Maximize the accuracy score
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)  # Each individual represents a feature subset

# Function to evaluate a feature subset
def evaluate(individual):
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    
    if len(selected_features) == 0:
        return 0,  # Avoid selecting no features

    # Select features
    X_selected = X.iloc[:, selected_features]
    
    # Scale the features for better optimization convergence
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)
    
    # Logistic Regression
    classifier = LogisticRegression(max_iter=5000)  # Increased max_iter to 5000
    
    # Cross-validation
    skf = StratifiedKFold(n_splits=5)
    accuracy = cross_val_score(classifier, X_scaled, y, cv=skf, scoring='accuracy').mean()
    
    return accuracy,

# Generate an individual (random feature subset)
def create_individual():
    return [random.randint(0, 1) for _ in range(n_features)]

# Define crossover, mutation, and selection
toolbox = base.Toolbox()
toolbox.register("individual", tools.initIterate, creator.Individual, create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Register the genetic operators
toolbox.register("mate", tools.cxTwoPoint)  # Crossover
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)  # Mutation with 5% probability
toolbox.register("select", tools.selTournament, tournsize=3)  # Selection via tournament
toolbox.register("evaluate", evaluate)

# Run the Genetic Algorithm with the main function
def main():
    random.seed(42)
    pop = toolbox.population(n=50)  # Create a population of 50 individuals
    hof = tools.HallOfFame(1)  # Store the best individual

    # Run the GA
    algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, halloffame=hof, verbose=True)

    return hof[0]  # Return the best feature subset

best_individual = main()

# Print the best feature subset found
selected_features_lr_os = [i for i, bit in enumerate(best_individual) if bit == 1]
print(f"Selected Features From Oversampled Dataset using Logistic: {X.columns[selected_features_lr_os].tolist()}")
print(f"Number of features selected: {len(selected_features_lr_os)}")

gen	nevals
0  	50    
1  	25    
2  	19    
3  	27    
4  	31    
5  	34    
6  	32    
7  	37    
8  	31    
9  	31    
10 	32    
Selected Features From Oversampled Dataset using Logistic: ['general white matter', 'general grey matter', 'general csf', 'cerebellum', 'brainstem', 'thalamus', 'hippocampus+amygdala', 'left cerebral cortex', 'left lateral ventricle', 'left cerebellum white matter', 'left thalamus', 'left caudate', 'left putamen', 'left pallidum', 'left hippocampus', 'left ventral DC', 'right cerebral white matter', 'right cerebral cortex', 'right lateral ventricle', 'right inferior lateral ventricle', 'right thalamus', 'right caudate', 'right pallidum', 'right hippocampus', 'right accumbens area', 'ctx-lh-bankssts', 'ctx-lh-caudalanteriorcingulate', 'ctx-lh-cuneus', 'ctx-lh-fusiform', 'ctx-lh-inferiorparietal', 'ctx-lh-inferiortemporal', 'ctx-lh-lateraloccipital', 'ctx-lh-lateralorbitofrontal', 'ctx-lh-lingual', 'ctx-lh-middletemporal', 'ctx-lh-parsopercularis', 'ctx-lh-p

In [33]:
# Genetic + Logistic Regression +LDA
# Load OverSampled dataset
data = pd.read_csv("resampled_data.csv")
y = data['decision'].astype(bool)
X = data.drop(columns=['Subject', 'decision', 'neuropsych_score'], axis=1)

# Number of features
n_features = X.shape[1]

# Check for existing creator classes to avoid redefinition warnings
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # Maximize the accuracy score
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)  # Each individual represents a feature subset

# Function to evaluate a feature subset
def evaluate(individual):
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    
    if len(selected_features) == 0:
        return 0,  # Avoid selecting no features

    # Select features
    X_selected = X.iloc[:, selected_features]
    
    # Scale the features for better optimization convergence
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)
    
    # Logistic Regression
    classifier = LogisticRegression(max_iter=5000)  # Increased max_iter to 5000
    
    # Cross-validation for Logistic Regression
    skf = StratifiedKFold(n_splits=5)
    accuracy_lr = cross_val_score(classifier, X_scaled, y, cv=skf, scoring='accuracy').mean()

    # Integrating LDA with logistic to the selected features
    lda = LDA(n_components=1)
    accuracy_lda = cross_val_score(lda, X_scaled, y, cv=skf, scoring='accuracy').mean()

    # Combine both Logistic Regression and LDA accuracy as fitness
    return (accuracy_lr + accuracy_lda) / 2,  # Average the scores from LR and LDA

# Generate an individual (random feature subset)
def create_individual():
    return [random.randint(0, 1) for _ in range(n_features)]

# Define crossover, mutation, and selection
toolbox = base.Toolbox()
toolbox.register("individual", tools.initIterate, creator.Individual, create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Register the genetic operators
toolbox.register("mate", tools.cxTwoPoint)  # Crossover
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)  # Mutation with 5% probability
toolbox.register("select", tools.selTournament, tournsize=3)  # Selection via tournament
toolbox.register("evaluate", evaluate)

# Run the Genetic Algorithm with the main function
def main():
    random.seed(42)
    pop = toolbox.population(n=50)  # Create a population of 50 individuals
    hof = tools.HallOfFame(1)  # Store the best individual

    # Run the GA
    algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, halloffame=hof, verbose=True)

    return hof[0]  # Return the best feature subset

best_individual = main()

# Print the best feature subset found
selected_features_lr_lda_os = [i for i, bit in enumerate(best_individual) if bit == 1]
print(f"Selected Features From Oversampled Dataset using Logistic + LDA: {X.columns[selected_features_lr_lda_os].tolist()}")
print(f"Number of features selected: {len(selected_features_lr_lda_os)}")

gen	nevals
0  	50    
1  	25    
2  	19    
3  	27    
4  	31    
5  	34    
6  	32    
7  	37    
8  	31    
9  	31    
10 	32    
Selected Features From Oversampled Dataset using Logistic + LDA: ['general white matter', 'brainstem', 'thalamus', 'putamen+pallidum', 'total intracranial', 'left cerebral white matter', 'left lateral ventricle', 'left inferior lateral ventricle', 'left cerebellum white matter', 'left cerebellum cortex', 'left thalamus', 'left putamen', 'csf', 'left ventral DC', 'right cerebral white matter', 'right cerebral cortex', 'right lateral ventricle', 'right inferior lateral ventricle', 'right thalamus', 'right caudate', 'right pallidum', 'right hippocampus', 'right accumbens area', 'right ventral DC', 'ctx-lh-bankssts', 'ctx-lh-caudalanteriorcingulate', 'ctx-lh-cuneus', 'ctx-lh-entorhinal', 'ctx-lh-fusiform', 'ctx-lh-inferiorparietal', 'ctx-lh-lateraloccipital', 'ctx-lh-lateralorbitofrontal', 'ctx-lh-lingual', 'ctx-lh-middletemporal', 'ctx-lh-parstriangularis', '

In [34]:
# Converting both lists to sets for comparison
set_lr_only_os = set(selected_features_lr_os)
set_lr_lda_os = set(selected_features_lr_lda_os)

# Features common in both methods
common_features_os = set_lr_only_os.intersection(set_lr_lda_os)

# Features unique to GA + Logistic
unique_lr_only_os = set_lr_only_os.difference(set_lr_lda_os)

# Features unique to GA + Logistic + LDA
unique_lr_lda_os = set_lr_lda_os.difference(set_lr_only_os)

# Print the comparison results
print(f"Common Features: {common_features_os}")
print(f"Unique to GA + Logistic: {unique_lr_only_os}")
print(f"Unique to GA + Logistic + LDA: {unique_lr_lda_os}")

Common Features: {0, 4, 5, 11, 13, 15, 17, 26, 27, 28, 29, 30, 33, 34, 36, 37, 39, 41, 42, 44, 46, 47, 50, 51, 52, 54, 60, 62, 63, 65, 66, 67, 69, 70, 71, 72, 74, 75, 77, 81, 82, 85, 86, 89, 91, 93, 94, 95, 100, 103}
Unique to GA + Logistic: {1, 2, 3, 7, 104, 73, 10, 105, 79, 16, 48, 18, 22, 57}
Unique to GA + Logistic + LDA: {64, 96, 97, 98, 6, 8, 9, 40, 106, 12, 45, 14, 76, 24, 59}


Now let's do the same for UNDERSAMPLED data

In [35]:
# Genetic + Logistic Regression
# Load UnderSampled dataset
data = pd.read_csv("undersampled_data.csv")
y = data['decision'].astype(bool)
X = data.drop(columns=['Subject', 'decision', 'neuropsych_score'], axis=1)

# Number of features
n_features = X.shape[1]

# Check for existing creator classes to avoid redefinition warnings [Use this part only when there is multiple class creation error]
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # Maximize the accuracy score
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)  # Each individual represents a feature subset

# Function to evaluate a feature subset
def evaluate(individual):
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    
    if len(selected_features) == 0:
        return 0,  # Avoid selecting no features

    # Select features
    X_selected = X.iloc[:, selected_features]
    
    # Scale the features for better optimization convergence
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)
    
    # Logistic Regression
    classifier = LogisticRegression(max_iter=5000)  # Increased max_iter to 5000
    
    # Cross-validation
    skf = StratifiedKFold(n_splits=5)
    accuracy = cross_val_score(classifier, X_scaled, y, cv=skf, scoring='accuracy').mean()
    
    return accuracy,

# Generate an individual (random feature subset)
def create_individual():
    return [random.randint(0, 1) for _ in range(n_features)]

# Define crossover, mutation, and selection
toolbox = base.Toolbox()
toolbox.register("individual", tools.initIterate, creator.Individual, create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Register the genetic operators
toolbox.register("mate", tools.cxTwoPoint)  # Crossover
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)  # Mutation with 5% probability
toolbox.register("select", tools.selTournament, tournsize=3)  # Selection via tournament
toolbox.register("evaluate", evaluate)

# Run the Genetic Algorithm with the main function
def main():
    random.seed(42)
    pop = toolbox.population(n=50)  # Create a population of 50 individuals
    hof = tools.HallOfFame(1)  # Store the best individual

    # Run the GA
    algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, halloffame=hof, verbose=True)

    return hof[0]  # Return the best feature subset

best_individual = main()

# Print the best feature subset found
selected_features_lr_us = [i for i, bit in enumerate(best_individual) if bit == 1]
print(f"Selected Features From Undersampled Dataset using Logistic : {X.columns[selected_features_lr_us].tolist()}")
print(f"Number of features selected: {len(selected_features_lr_us)}")

gen	nevals
0  	50    
1  	25    
2  	19    
3  	27    
4  	31    
5  	34    
6  	32    
7  	37    
8  	31    
9  	31    
10 	32    
Selected Features From Undersampled Dataset using Logistic : ['general white matter', 'general grey matter', 'brainstem', 'left cerebral white matter', 'left inferior lateral ventricle', 'left cerebellum white matter', 'left cerebellum cortex', 'left thalamus', 'left caudate', 'left putamen', 'left pallidum', '4th ventricle', 'brain-stem', 'left ventral DC', 'right lateral ventricle', 'right cerebellum white matter', 'right cerebellum cortex', 'right caudate', 'right pallidum', 'right ventral DC', 'ctx-lh-bankssts', 'ctx-lh-caudalanteriorcingulate', 'ctx-lh-caudalmiddlefrontal', 'ctx-lh-entorhinal', 'ctx-lh-fusiform', 'ctx-lh-inferiorparietal', 'ctx-lh-lateraloccipital', 'ctx-lh-lingual', 'ctx-lh-middletemporal', 'ctx-lh-parsopercularis', 'ctx-lh-precentral', 'ctx-lh-precuneus', 'ctx-lh-rostralanteriorcingulate', 'ctx-lh-rostralmiddlefrontal', 'ctx-lh-supe

In [36]:
# Genetic + Logistic Regression +LDA
# Load UnderSampled dataset
data = pd.read_csv("undersampled_data.csv")
y = data['decision'].astype(bool)
X = data.drop(columns=['Subject', 'decision', 'neuropsych_score'], axis=1)

# Number of features
n_features = X.shape[1]

# Check for existing creator classes to avoid redefinition warnings
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # Maximize the accuracy score
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)  # Each individual represents a feature subset

# Function to evaluate a feature subset
def evaluate(individual):
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    
    if len(selected_features) == 0:
        return 0,  # Avoid selecting no features

    # Select features
    X_selected = X.iloc[:, selected_features]
    
    # Scale the features for better optimization convergence
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)
    
    # Logistic Regression
    classifier = LogisticRegression(max_iter=5000)  # Increased max_iter to 5000
    
    # Cross-validation for Logistic Regression
    skf = StratifiedKFold(n_splits=5)
    accuracy_lr = cross_val_score(classifier, X_scaled, y, cv=skf, scoring='accuracy').mean()

    # Integrating LDA with logistic to the selected features
    lda = LDA(n_components=1)
    accuracy_lda = cross_val_score(lda, X_scaled, y, cv=skf, scoring='accuracy').mean()

    # Combine both Logistic Regression and LDA accuracy as fitness
    return (accuracy_lr + accuracy_lda) / 2,  # Average the scores from LR and LDA

# Generate an individual (random feature subset)
def create_individual():
    return [random.randint(0, 1) for _ in range(n_features)]

# Define crossover, mutation, and selection
toolbox = base.Toolbox()
toolbox.register("individual", tools.initIterate, creator.Individual, create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Register the genetic operators
toolbox.register("mate", tools.cxTwoPoint)  # Crossover
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)  # Mutation with 5% probability
toolbox.register("select", tools.selTournament, tournsize=3)  # Selection via tournament
toolbox.register("evaluate", evaluate)

# Run the Genetic Algorithm with the main function
def main():
    random.seed(42)
    pop = toolbox.population(n=50)  # Create a population of 50 individuals
    hof = tools.HallOfFame(1)  # Store the best individual

    # Run the GA
    algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, halloffame=hof, verbose=True)

    return hof[0]  # Return the best feature subset

best_individual = main()

# Print the best feature subset found
selected_features_lr_lda_us = [i for i, bit in enumerate(best_individual) if bit == 1]
print(f"Selected Features From Undersampled Dataset using Logistic + LDA: {X.columns[selected_features_lr_lda_us].tolist()}")
print(f"Number of features selected: {len(selected_features_lr_lda_us)}")

gen	nevals
0  	50    
1  	25    
2  	19    
3  	27    
4  	31    
5  	34    
6  	32    
7  	37    
8  	31    
9  	31    
10 	32    
Selected Features From Undersampled Dataset using Logistic + LDA: ['general grey matter', 'brainstem', 'left cerebral white matter', 'left inferior lateral ventricle', 'left cerebellum white matter', 'left cerebellum cortex', 'left thalamus', 'left caudate', 'left putamen', 'left pallidum', '4th ventricle', 'left ventral DC', 'right cerebral cortex', 'right lateral ventricle', 'right cerebellum white matter', 'right cerebellum cortex', 'right thalamus', 'right pallidum', 'right ventral DC', 'ctx-lh-bankssts', 'ctx-lh-caudalmiddlefrontal', 'ctx-lh-cuneus', 'ctx-lh-lateraloccipital', 'ctx-lh-lingual', 'ctx-lh-paracentral', 'ctx-lh-parsopercularis', 'ctx-lh-parstriangularis', 'ctx-lh-pericalcarine', 'ctx-lh-posteriorcingulate', 'ctx-lh-precentral', 'ctx-lh-precuneus', 'ctx-lh-rostralanteriorcingulate', 'ctx-lh-rostralmiddlefrontal', 'ctx-lh-transversetemporal

In [37]:
# Converting both lists to sets for comparison
set_lr_only_us = set(selected_features_lr_us)
set_lr_lda_us = set(selected_features_lr_lda_us)

# Features common in both methods
common_features_us = set_lr_only_us.intersection(set_lr_lda_us)

# Features unique to GA + Logistic
unique_lr_only_us = set_lr_only_us.difference(set_lr_lda_us)

# Features unique to GA + Logistic + LDA
unique_lr_lda_us = set_lr_lda_us.difference(set_lr_only_us)

# Print the comparison results
print(f"Common Features: {common_features_us}")
print(f"Unique to GA + Logistic: {unique_lr_only_us}")
print(f"Unique to GA + Logistic + LDA: {unique_lr_lda_us}")

Common Features: {1, 4, 9, 12, 13, 14, 15, 16, 17, 18, 20, 26, 29, 31, 32, 36, 40, 41, 43, 50, 52, 57, 63, 64, 65, 66, 73, 81, 82, 95, 98, 99, 103, 104, 108}
Unique to GA + Logistic: {0, 21, 34, 42, 45, 46, 47, 54, 67, 69, 70, 71, 72, 74, 75, 77, 83, 91, 94, 101, 102}
Unique to GA + Logistic + LDA: {96, 33, 97, 107, 44, 60, 78, 84, 85, 86, 56, 89, 59, 28, 62}
