In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def target_encode(X, y, column):
    """Replace a categorical column with the mean of the target per category.

    Missing values in ``column`` are treated as their own category,
    ``'Missing'``, before the per-category means are computed.

    Args:
        X: Feature DataFrame containing ``column``. It is not modified.
        y: Target Series sharing its index with ``X``. It does not need to
            have a name.
        column: Name of the categorical column to encode.

    Returns:
        A copy of ``X`` in which ``column`` holds, for every row, the mean
        of ``y`` over all rows that share that row's category.

    Raises:
        ValueError: If ``column`` is not a column of ``X``.
    """
    if column not in X.columns:
        raise ValueError(f"Column '{column}' not found in feature set X")

    categories = X[column]
    if isinstance(categories.dtype, pd.CategoricalDtype):
        # fillna() on a categorical raises unless 'Missing' is already a
        # declared category, so fall back to plain Python objects.
        categories = categories.astype(object)
    categories = categories.fillna('Missing')

    # Group the target directly by the category labels. This avoids building
    # a temporary frame and looking the target up by name, which fails for an
    # unnamed Series or one whose name collides with a feature column.
    mean_encoding = y.groupby(categories).mean()

    # Work on a copy so the caller's DataFrame is left untouched.
    encoded = X.copy()
    encoded[column] = categories.map(mean_encoding)
    return encoded

def load_data(filepath, target_column):
    """Load a CSV, target-encode its categorical features and split it.

    Column names are normalised by removing all whitespace, so the target
    column is looked up both as given and with its whitespace removed.

    Args:
        filepath: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        target_column: Name of the column to predict.

    Returns:
        A pair ``((X_train, X_test, y_train, y_test), feature_names)`` where
        the first element is the 80/20 ``train_test_split`` result
        (``random_state=42``) and ``feature_names`` lists the feature columns.

    Raises:
        ValueError: If the target column is not present in the dataset.
    """
    data = pd.read_csv(filepath)
    print(target_column)
    data.columns = data.columns.str.strip().str.replace(r'\s+', '', regex=True)

    print("First few rows of the dataset:\n", data.head())
    print("\nColumn names:", data.columns)

    # The normalisation above removes the space from pandas' saved-index
    # column name 'Unnamed: 0', so it has to be matched as 'Unnamed:0'.
    data = data.drop(columns=['Unnamed:0'], errors='ignore')

    # Accept the target name as given, or normalised like the column names.
    target_key = target_column
    if target_key not in data.columns and isinstance(target_key, str):
        target_key = ''.join(target_key.split())
    if target_key not in data.columns:
        raise ValueError(f"Target column '{target_column}' not found in the dataset.")

    X = data.drop(target_key, axis=1)
    y = data[target_key]

    categorical_columns = X.select_dtypes(include=['object', 'category']).columns

    # NOTE(review): the encoding is computed on the full dataset before the
    # split, so test-row targets influence the encoded features (target
    # leakage). Consider fitting the encoding on the training rows only.
    for column in categorical_columns:
        X = target_encode(X, y, column)

    return train_test_split(X, y, test_size=0.2, random_state=42), X.columns.tolist()

# Placeholders: point these at the dataset and at its target column.
filepath = 'your csv file link'
target_column = 'your target class'

split_frames, feature_names = load_data(filepath, target_column)
X_train, X_test, y_train, y_test = split_frames

# Persist every split without the index so the next cell can reload them.
for csv_name, split in (
    ('processed_X_train.csv', X_train),
    ('processed_X_test.csv', X_test),
    ('processed_y_train.csv', y_train),
    ('processed_y_test.csv', y_test),
):
    split.to_csv(csv_name, index=False)


In [None]:
import pandas as pd
import random
from deap import base, creator, tools
import numpy as np


# Register the DEAP classes only once: re-running this cell in the same
# session would otherwise call creator.create() again for an existing name.
try:
    getattr(creator, "FitnessMax")
except AttributeError:
    # Single objective with weight +1.0.
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))

try:
    getattr(creator, "Individual")
except AttributeError:
    # An individual is a plain list carrying a FitnessMax fitness attribute.
    creator.create("Individual", list, fitness=creator.FitnessMax)

def get_correlation_scores(X_train, y_train):
    """Return each feature's absolute correlation with the target, scaled to [0, 1].

    Args:
        X_train: Feature DataFrame.
        y_train: Target Series sharing its index with ``X_train``.

    Returns:
        A Series indexed by feature name. Scores are divided by the largest
        absolute correlation so the most relevant feature scores 1.0.
        Features whose correlation is undefined (for example constant
        columns) score 0.0.
    """
    # An undefined correlation carries no evidence of relevance, so it is
    # scored as 0 rather than left as NaN.
    correlations = X_train.corrwith(y_train).abs().fillna(0.0)

    # Normalise only when there is a positive maximum; dividing by zero
    # would turn every score into NaN.
    peak = correlations.max()
    if peak > 0:
        correlations = correlations / peak
    return correlations

def evaluate(individual, X_train, y_train, feature_names, correlation_scores):
    """Score a feature subset as target relevance minus mutual redundancy.

    Args:
        individual: Sequence of 0/1 flags, one per entry of ``feature_names``.
        X_train: Feature DataFrame containing the columns in ``feature_names``.
        y_train: Target values. Not read here; kept so the signature is
            unchanged for callers.
        feature_names: Feature names, position-aligned with ``individual``.
        correlation_scores: Per-feature relevance scores, indexable by a list
            of feature names (a pandas Series in this file).

    Returns:
        A one-element tuple holding the fitness, as DEAP expects. Subsets
        with fewer than two features score 0.
    """
    selected_features = [
        name for name, flag in zip(feature_names, individual) if flag == 1
    ]

    if len(selected_features) < 2:
        return (0,)  # Low score if fewer features are selected

    # Relevance: summed (normalised) correlation with the target.
    selected_corr = correlation_scores[selected_features].sum()

    # Redundancy: mean absolute correlation over every distinct feature pair
    # (strict upper triangle of the correlation matrix).
    inter_correlation = X_train[selected_features].corr().abs().to_numpy()
    pairwise = inter_correlation[np.triu_indices(len(selected_features), k=1)]

    # Pairs involving a constant feature have an undefined (NaN) correlation.
    # Ignore them so the fitness never becomes NaN, which would make fitness
    # comparisons during selection meaningless.
    defined = pairwise[~np.isnan(pairwise)]
    penalty = defined.mean() if defined.size else 0.0

    fitness = selected_corr - penalty
    return (fitness,)


def genetic_feature_selection(X_train, y_train, feature_names, n_gen=50, pop_size=200, cxpb=0.7, mutpb=0.3, seed=None):
    """Select a feature subset with a simple generational genetic algorithm.

    Each individual is a list of 0/1 flags, one per feature. Fitness is
    computed by ``evaluate`` using the scores from ``get_correlation_scores``.

    Args:
        X_train: Feature DataFrame.
        y_train: Target Series.
        feature_names: Names of the candidate features (columns of ``X_train``).
        n_gen: Number of generations to run.
        pop_size: Number of individuals in the population.
        cxpb: Probability of applying crossover to a pair of offspring.
        mutpb: Probability of mutating an offspring.
        seed: Optional seed for the global ``random`` module, for
            reproducible runs. ``None`` leaves the generator untouched.

    Returns:
        The names of the features flagged in the best individual seen during
        the whole run. Falls back to the first feature if none is flagged.
    """
    if seed is not None:
        random.seed(seed)

    correlation_scores = get_correlation_scores(X_train, y_train)

    toolbox = base.Toolbox()
    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(feature_names))
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    toolbox.register("evaluate", evaluate, X_train=X_train, y_train=y_train, feature_names=feature_names, correlation_scores=correlation_scores)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    def evaluate_invalid(individuals):
        # (Re)compute the fitness of every individual whose fitness is unset.
        invalid_ind = [ind for ind in individuals if not ind.fitness.valid]
        for ind, fit in zip(invalid_ind, map(toolbox.evaluate, invalid_ind)):
            ind.fitness.values = fit

    population = toolbox.population(n=pop_size)

    # Tournament selection compares fitness values, so the initial population
    # must be evaluated before the first generation is selected.
    evaluate_invalid(population)

    # Offspring replace the whole population each generation, so the best
    # individual could be lost; the hall of fame keeps a copy of the best one.
    hall_of_fame = tools.HallOfFame(1)
    hall_of_fame.update(population)

    for gen in range(n_gen):
        print(f"Generation {gen}")

        offspring = toolbox.select(population, len(population))
        offspring = list(map(toolbox.clone, offspring))

        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            if random.random() < cxpb:
                toolbox.mate(child1, child2)
                # Modified in place: the cached fitness is no longer valid.
                del child1.fitness.values
                del child2.fitness.values

        for mutant in offspring:
            if random.random() < mutpb:
                toolbox.mutate(mutant)
                del mutant.fitness.values

        evaluate_invalid(offspring)

        population[:] = offspring
        hall_of_fame.update(population)

        selected_counts = [sum(ind) for ind in population]
        avg_selected = sum(selected_counts) / len(selected_counts)
        print(f"Average number of features selected: {avg_selected}")

    best_individual = hall_of_fame[0]
    selected_features = [
        name for name, flag in zip(feature_names, best_individual) if flag == 1
    ]

    if not selected_features:
        print("Warning: No features were selected. Defaulting to the first feature.")
        selected_features = [feature_names[0]]

    return selected_features

# Reload the splits written by the preprocessing cell. A stray saved-index
# column is dropped if present.
X_train = pd.read_csv('processed_X_train.csv').drop(columns=['Unnamed: 0'], errors='ignore')
X_test = pd.read_csv('processed_X_test.csv').drop(columns=['Unnamed: 0'], errors='ignore')

# squeeze("columns") turns the single-column frame into a Series. A bare
# squeeze() would also collapse a one-row file down to a scalar.
y_train = pd.read_csv('processed_y_train.csv').squeeze("columns")
y_test = pd.read_csv('processed_y_test.csv').squeeze("columns")

print("Running Genetic Algorithm for Feature Selection...")
feature_names = X_train.columns.tolist()
selected_features = genetic_feature_selection(X_train, y_train, feature_names)

print("Selected Features:", selected_features)
