In [2]:
import random
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler

In [6]:
#Let's run Genetic Algorithm for the Original Dataset
# Load and prepare data 
data = pd.read_csv("synth_seg.csv")
y = data['decision'].astype(bool)
X = data.drop(columns=['Subject', 'decision', 'neuropsych_score'], axis=1)
n_features = X.shape[1]

# Genetic Algorithm Functions From Scratch 
def create_individual(n_features):
#Creates a random individual (feature subset).
    return [random.randint(0, 1) for _ in range(n_features)]

def evaluate(individual, X, y):
#Evaluates the fitness of an individual.
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    if len(selected_features) == 0:
        return 0  # Avoid selecting no features

    X_selected = X.iloc[:, selected_features]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Apply LDA
    lda = LDA(n_components=1)  # Use 1 component as in lda.py
    X_lda = lda.fit_transform(X_scaled, y)

    # Logistic Regression on LDA-transformed data
    classifier = LogisticRegression(max_iter=5000)
    skf = StratifiedKFold(n_splits=5)
    accuracy_lda = cross_val_score(
        classifier, X_lda, y, cv=skf, scoring='accuracy'
    ).mean()

    return accuracy_lda  # Use only LDA accuracy here

def selection(population, fitnesses, tournsize=3):
# Performs selection.
    selected = []
    for _ in range(len(population)):
        aspirants = random.sample(range(len(population)), tournsize)
        winner = aspirants[
            max(range(tournsize), key=lambda i: fitnesses[aspirants[i]])
        ]
        selected.append(population[winner])
    return selected

def crossover(parent1, parent2):
#Performs crossover at a random point.
    point = random.randint(1, len(parent1) - 1)
    child1 = parent1[:point] + parent2[point:]
    child2 = parent2[:point] + parent1[point:]
    return child1, child2

def mutation(individual, indpb=0.05):
#Performs flip-bit mutation.
    for i in range(len(individual)):
        if random.random() < indpb:
            individual[i] = 1 - individual[i]
    return (individual,)

# Main Genetic Algorithm Call Function

def main(X, y, pop_size=100, ngen=10, cxpb=0.5, mutpb=0.2, top_percent=0.10):
    random.seed(42)

    # Initialize population
    population = [create_individual(n_features) for _ in range(pop_size)]

    # Evaluate initial population
    fitnesses = [evaluate(ind, X, y) for ind in population]

    # Store best individuals and the generation they were found
    best_individuals = []
    best_generations = []

    for gen in range(ngen):
        # Selection
        offspring = selection(population, fitnesses)

        # Crossover
        for i in range(1, len(offspring), 2):
            if random.random() < cxpb:
                offspring[i - 1], offspring[i] = crossover(
                    offspring[i - 1], offspring[i]
                )

        # Mutation
        for i in range(len(offspring)):
            if random.random() < mutpb:
                offspring[i], = mutation(offspring[i])

        # Evaluate offspring
        fitnesses = [evaluate(ind, X, y) for ind in offspring]

        # Replace population with offspring
        population[:] = offspring

        # Store the top individuals and their generation
        num_top = int(pop_size * top_percent)
        top_indices = np.argsort(fitnesses)[-num_top:]
        for i in top_indices:
            best_individuals.append(population[i])
            best_generations.append(gen)  # Store the generation

    return best_individuals, best_generations  # Return both lists

# Run the GA
best_individuals, best_generations = main(X, y, pop_size=100, top_percent=0.10)  # Use top 10%

# Print the final best solution and its generation
best_individual = max(best_individuals, key=lambda ind: evaluate(ind, X, y))
best_index = best_individuals.index(best_individual)
best_generation = best_generations[best_index]

selected_features_lda = [i for i, bit in enumerate(best_individual) if bit == 1]
print(f"Final Best Solution (found at generation {best_generation}):")
print(f"  Selected Features: {X.columns[selected_features_lda].tolist()}")
print(f"  Number of features selected: {len(selected_features_lda)}")

Final Best Solution (found at generation 8):
  Selected Features: ['general white matter', 'general grey matter', 'general csf', 'cerebellum', 'brainstem', 'thalamus', 'putamen+pallidum', 'left cerebral cortex', 'left lateral ventricle', 'left inferior lateral ventricle', 'left cerebellum white matter', 'left thalamus', 'left caudate', 'left putamen', 'left pallidum', '4th ventricle', 'brain-stem', 'left hippocampus', 'csf', 'left accumbens area', 'left ventral DC', 'right lateral ventricle', 'right inferior lateral ventricle', 'right cerebellum cortex', 'right caudate', 'right pallidum', 'right hippocampus', 'right amygdala', 'right accumbens area', 'right ventral DC', 'ctx-lh-bankssts', 'ctx-lh-caudalanteriorcingulate', 'ctx-lh-cuneus', 'ctx-lh-entorhinal', 'ctx-lh-inferiorparietal', 'ctx-lh-isthmuscingulate', 'ctx-lh-lateraloccipital', 'ctx-lh-lateralorbitofrontal', 'ctx-lh-middletemporal', 'ctx-lh-parahippocampal', 'ctx-lh-parstriangularis', 'ctx-lh-posteriorcingulate', 'ctx-lh-pre

In [7]:
#Let's run Genetic Algorithm for the OverSampled Dataset
# Load and prepare data 
data = pd.read_csv("resampled_data.csv")
y = data['decision'].astype(bool)
X = data.drop(columns=['Subject', 'decision', 'neuropsych_score'], axis=1)
n_features = X.shape[1]

# Genetic Algorithm Functions From Scratch 
def create_individual(n_features):
#Creates a random individual (feature subset).
    return [random.randint(0, 1) for _ in range(n_features)]

def evaluate(individual, X, y):
#Evaluates the fitness of an individual.
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    if len(selected_features) == 0:
        return 0  # Avoid selecting no features

    X_selected = X.iloc[:, selected_features]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Apply LDA
    lda = LDA(n_components=1)  # Use 1 component as in lda.py
    X_lda = lda.fit_transform(X_scaled, y)

    # Logistic Regression on LDA-transformed data
    classifier = LogisticRegression(max_iter=5000)
    skf = StratifiedKFold(n_splits=5)
    accuracy_lda = cross_val_score(
        classifier, X_lda, y, cv=skf, scoring='accuracy'
    ).mean()

    return accuracy_lda  # Use only LDA accuracy here

def selection(population, fitnesses, tournsize=3):
# Performs selection.
    selected = []
    for _ in range(len(population)):
        aspirants = random.sample(range(len(population)), tournsize)
        winner = aspirants[
            max(range(tournsize), key=lambda i: fitnesses[aspirants[i]])
        ]
        selected.append(population[winner])
    return selected

def crossover(parent1, parent2):
#Performs crossover at a random point.
    point = random.randint(1, len(parent1) - 1)
    child1 = parent1[:point] + parent2[point:]
    child2 = parent2[:point] + parent1[point:]
    return child1, child2

def mutation(individual, indpb=0.05):
#Performs flip-bit mutation.
    for i in range(len(individual)):
        if random.random() < indpb:
            individual[i] = 1 - individual[i]
    return (individual,)

# Main Genetic Algorithm Call Function

def main(X, y, pop_size=100, ngen=10, cxpb=0.5, mutpb=0.2, top_percent=0.10):
    random.seed(42)

    # Initialize population
    population = [create_individual(n_features) for _ in range(pop_size)]

    # Evaluate initial population
    fitnesses = [evaluate(ind, X, y) for ind in population]

    # Store best individuals and the generation they were found
    best_individuals = []
    best_generations = []

    for gen in range(ngen):
        # Selection
        offspring = selection(population, fitnesses)

        # Crossover
        for i in range(1, len(offspring), 2):
            if random.random() < cxpb:
                offspring[i - 1], offspring[i] = crossover(
                    offspring[i - 1], offspring[i]
                )

        # Mutation
        for i in range(len(offspring)):
            if random.random() < mutpb:
                offspring[i], = mutation(offspring[i])

        # Evaluate offspring
        fitnesses = [evaluate(ind, X, y) for ind in offspring]

        # Replace population with offspring
        population[:] = offspring

        # Store the top individuals and their generation
        num_top = int(pop_size * top_percent)
        top_indices = np.argsort(fitnesses)[-num_top:]
        for i in top_indices:
            best_individuals.append(population[i])
            best_generations.append(gen)  # Store the generation

    return best_individuals, best_generations  # Return both lists

# Run the GA
best_individuals, best_generations = main(X, y, pop_size=100, top_percent=0.10)  # Use top 10%

# Print the final best solution and its generation
best_individual = max(best_individuals, key=lambda ind: evaluate(ind, X, y))
best_index = best_individuals.index(best_individual)
best_generation = best_generations[best_index]

selected_features_lda = [i for i, bit in enumerate(best_individual) if bit == 1]
print(f"Final Best Solution (found at generation {best_generation}):")
print(f"  Selected Features: {X.columns[selected_features_lda].tolist()}")
print(f"  Number of features selected: {len(selected_features_lda)}")

Final Best Solution (found at generation 6):
  Selected Features: ['general white matter', 'general grey matter', 'general csf', 'cerebellum', 'brainstem', 'thalamus', 'putamen+pallidum', 'total intracranial', 'left cerebral white matter', 'left cerebral cortex', 'left cerebellum white matter', 'left cerebellum cortex', 'left caudate', 'left putamen', '3rd ventricle', '4th ventricle', 'brain-stem', 'left hippocampus', 'right cerebral white matter', 'right cerebral cortex', 'right lateral ventricle', 'right inferior lateral ventricle', 'right thalamus', 'right caudate', 'right putamen', 'right pallidum', 'right hippocampus', 'right accumbens area', 'right ventral DC', 'ctx-lh-bankssts', 'ctx-lh-caudalanteriorcingulate', 'ctx-lh-cuneus', 'ctx-lh-fusiform', 'ctx-lh-inferiorparietal', 'ctx-lh-inferiortemporal', 'ctx-lh-lateraloccipital', 'ctx-lh-lateralorbitofrontal', 'ctx-lh-lingual', 'ctx-lh-middletemporal', 'ctx-lh-parsopercularis', 'ctx-lh-parstriangularis', 'ctx-lh-posteriorcingulate'

In [8]:
#Let's run Genetic Algorithm for the UnderSampled Dataset
# Load and prepare data 
data = pd.read_csv("undersampled_data.csv")
y = data['decision'].astype(bool)
X = data.drop(columns=['Subject', 'decision', 'neuropsych_score'], axis=1)
n_features = X.shape[1]

# Genetic Algorithm Functions From Scratch 
def create_individual(n_features):
#Creates a random individual (feature subset).
    return [random.randint(0, 1) for _ in range(n_features)]

def evaluate(individual, X, y):
#Evaluates the fitness of an individual.
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    if len(selected_features) == 0:
        return 0  # Avoid selecting no features

    X_selected = X.iloc[:, selected_features]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_selected)

    # Apply LDA
    lda = LDA(n_components=1)  # Use 1 component as in lda.py
    X_lda = lda.fit_transform(X_scaled, y)

    # Logistic Regression on LDA-transformed data
    classifier = LogisticRegression(max_iter=5000)
    skf = StratifiedKFold(n_splits=5)
    accuracy_lda = cross_val_score(
        classifier, X_lda, y, cv=skf, scoring='accuracy'
    ).mean()

    return accuracy_lda  # Use only LDA accuracy here

def selection(population, fitnesses, tournsize=3):
# Performs selection.
    selected = []
    for _ in range(len(population)):
        aspirants = random.sample(range(len(population)), tournsize)
        winner = aspirants[
            max(range(tournsize), key=lambda i: fitnesses[aspirants[i]])
        ]
        selected.append(population[winner])
    return selected

def crossover(parent1, parent2):
#Performs crossover at a random point.
    point = random.randint(1, len(parent1) - 1)
    child1 = parent1[:point] + parent2[point:]
    child2 = parent2[:point] + parent1[point:]
    return child1, child2

def mutation(individual, indpb=0.05):
#Performs flip-bit mutation.
    for i in range(len(individual)):
        if random.random() < indpb:
            individual[i] = 1 - individual[i]
    return (individual,)

# Main Genetic Algorithm Call Function

def main(X, y, pop_size=100, ngen=10, cxpb=0.5, mutpb=0.2, top_percent=0.10):
    random.seed(42)

    # Initialize population
    population = [create_individual(n_features) for _ in range(pop_size)]

    # Evaluate initial population
    fitnesses = [evaluate(ind, X, y) for ind in population]

    # Store best individuals and the generation they were found
    best_individuals = []
    best_generations = []

    for gen in range(ngen):
        # Selection
        offspring = selection(population, fitnesses)

        # Crossover
        for i in range(1, len(offspring), 2):
            if random.random() < cxpb:
                offspring[i - 1], offspring[i] = crossover(
                    offspring[i - 1], offspring[i]
                )

        # Mutation
        for i in range(len(offspring)):
            if random.random() < mutpb:
                offspring[i], = mutation(offspring[i])

        # Evaluate offspring
        fitnesses = [evaluate(ind, X, y) for ind in offspring]

        # Replace population with offspring
        population[:] = offspring

        # Store the top individuals and their generation
        num_top = int(pop_size * top_percent)
        top_indices = np.argsort(fitnesses)[-num_top:]
        for i in top_indices:
            best_individuals.append(population[i])
            best_generations.append(gen)  # Store the generation

    return best_individuals, best_generations  # Return both lists

# Run the GA
best_individuals, best_generations = main(X, y, pop_size=100, top_percent=0.10)  # Use top 10%

# Print the final best solution and its generation
best_individual = max(best_individuals, key=lambda ind: evaluate(ind, X, y))
best_index = best_individuals.index(best_individual)
best_generation = best_generations[best_index]

selected_features_lda = [i for i, bit in enumerate(best_individual) if bit == 1]
print(f"Final Best Solution (found at generation {best_generation}):")
print(f"  Selected Features: {X.columns[selected_features_lda].tolist()}")
print(f"  Number of features selected: {len(selected_features_lda)}")

Final Best Solution (found at generation 8):
  Selected Features: ['general csf', 'brainstem', 'thalamus', 'hippocampus+amygdala', 'total intracranial', 'left lateral ventricle', 'left cerebellum white matter', 'left cerebellum cortex', 'left thalamus', 'left caudate', 'left pallidum', '3rd ventricle', 'left hippocampus', 'csf', 'left accumbens area', 'right cerebral white matter', 'right lateral ventricle', 'right inferior lateral ventricle', 'right cerebellum white matter', 'right cerebellum cortex', 'right thalamus', 'right pallidum', 'right hippocampus', 'right amygdala', 'right ventral DC', 'ctx-lh-caudalanteriorcingulate', 'ctx-lh-caudalmiddlefrontal', 'ctx-lh-cuneus', 'ctx-lh-entorhinal', 'ctx-lh-inferiorparietal', 'ctx-lh-parsopercularis', 'ctx-lh-parstriangularis', 'ctx-lh-precentral', 'ctx-lh-precuneus', 'ctx-lh-rostralanteriorcingulate', 'ctx-lh-rostralmiddlefrontal', 'ctx-lh-superiorfrontal', 'ctx-lh-supramarginal', 'ctx-lh-frontalpole', 'ctx-lh-transversetemporal', 'ctx-lh