In [4]:
import pandas as pd

# Load dataset
df = pd.read_csv('feature_selection.csv')
df.head()


Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_12,Feature_13,Feature_14,Feature_15,Feature_16,Feature_17,Feature_18,Feature_19,Feature_20,Target
0,1.470848,-0.36045,-0.591602,-0.728228,0.94169,1.065964,0.017832,-0.596184,1.840712,-1.497093,...,-0.603968,2.899256,0.037567,-1.249523,0.257963,0.416628,1.408208,-1.838041,-0.833142,1
1,4.513369,-2.227103,-1.140747,2.018263,-2.238358,-0.49737,0.71455,0.938883,-2.395169,0.159837,...,1.461499,3.954171,0.309054,0.538184,-7.157865,-4.532216,-0.0818,-9.325362,0.574386,1
2,-2.355643,2.218601,-1.603269,0.873394,0.401483,0.717264,-0.859399,-1.04219,-2.175965,0.980231,...,0.544434,-2.466258,-0.470256,0.073018,-2.203531,-2.299263,-1.742761,-0.271579,-0.359285,0
3,-1.596198,-0.857427,1.772434,-0.639361,1.419409,-0.438525,0.281949,2.345145,1.00623,0.389135,...,-1.025051,-2.422975,1.579807,-0.300713,4.26712,2.893775,1.236697,6.034785,-0.045711,0
4,2.840049,-2.4896,-0.844902,-1.594362,-4.688517,0.459637,0.913607,-1.143505,1.263937,-2.040928,...,4.176424,1.341742,0.133565,1.743819,1.531188,2.269808,0.053489,-3.151109,1.603702,0


Define Helper Functions


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fitness function: Evaluate model performance
def fitness_function(individual, X, y):
    selected_features = [feature for feature, include in zip(X.columns, individual) if include == 1]
    if len(selected_features) == 0:
        return 0
    X_selected = X[selected_features]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)


Initialize the Population

In [6]:
import numpy as np

def initialize_population(pop_size, num_features):
    population = np.random.randint(2, size=(pop_size, num_features))
    return population

pop_size = 50
num_features = df.shape[1] - 1  # Exclude the target column
population = initialize_population(pop_size, num_features)


Define Genetic Algorithm Operations

In [7]:
# Selection: Select the best-performing individuals
def selection(population, fitness_scores, num_parents):
    parents = np.empty((num_parents, population.shape[1]))
    for i in range(num_parents):
        max_fitness_idx = np.where(fitness_scores == np.max(fitness_scores))
        max_fitness_idx = max_fitness_idx[0][0]
        parents[i, :] = population[max_fitness_idx, :]
        fitness_scores[max_fitness_idx] = -999999  # Avoid selecting the same individual
    return parents

# Crossover: Combine pairs of parents to create offspring
def crossover(parents, offspring_size):
    offspring = np.empty(offspring_size)
    crossover_point = np.uint8(offspring_size[1] / 2)

    for k in range(offspring_size[0]):
        parent1_idx = k % parents.shape[0]
        parent2_idx = (k + 1) % parents.shape[0]
        offspring[k, 0:crossover_point] = parents[parent1_idx, 0:crossover_point]
        offspring[k, crossover_point:] = parents[parent2_idx, crossover_point:]
    return offspring

# Mutation: Introduce random changes to some individuals
def mutation(offspring, mutation_rate=0.01):
    for idx in range(offspring.shape[0]):
        for gene_idx in range(offspring.shape[1]):
            if np.random.rand() < mutation_rate:
                offspring[idx, gene_idx] = 1 - offspring[idx, gene_idx]
    return offspring


Run the Genetic Algorithm

In [10]:
def genetic_algorithm(X, y, num_generations, pop_size, num_parents, mutation_rate):
    num_features = X.shape[1]
    population = initialize_population(pop_size, num_features)
    for generation in range(num_generations):
        fitness_scores = np.array([fitness_function(individual, X, y) for individual in population])
        parents = selection(population, fitness_scores, num_parents)
        offspring_size = (pop_size - parents.shape[0], num_features)
        offspring = crossover(parents, offspring_size)
        offspring = mutation(offspring, mutation_rate)
        population[0:parents.shape[0], :] = parents
        population[parents.shape[0]:, :] = offspring
        best_fitness = np.max(fitness_scores)
        print(f"Generation {generation}: Best Fitness = {best_fitness}")
    return population, fitness_scores

X = df.drop(columns='Target')
y = df['Target']

num_generations = 30
num_parents = 10
population, fitness_scores = genetic_algorithm(X, y, num_generations, pop_size, num_parents, mutation_rate=0.01)


Generation 0: Best Fitness = 0.88
Generation 1: Best Fitness = 0.91
Generation 2: Best Fitness = 0.92
Generation 3: Best Fitness = 0.93
Generation 4: Best Fitness = 0.93
Generation 5: Best Fitness = 0.93
Generation 6: Best Fitness = 0.935
Generation 7: Best Fitness = 0.935
Generation 8: Best Fitness = 0.935
Generation 9: Best Fitness = 0.94
Generation 10: Best Fitness = 0.94
Generation 11: Best Fitness = 0.94
Generation 12: Best Fitness = 0.94
Generation 13: Best Fitness = 0.94
Generation 14: Best Fitness = 0.94
Generation 15: Best Fitness = 0.94
Generation 16: Best Fitness = 0.94
Generation 17: Best Fitness = 0.94
Generation 18: Best Fitness = 0.94
Generation 19: Best Fitness = 0.94
Generation 20: Best Fitness = 0.94
Generation 21: Best Fitness = 0.94
Generation 22: Best Fitness = 0.94
Generation 23: Best Fitness = 0.94
Generation 24: Best Fitness = 0.94
Generation 25: Best Fitness = 0.94
Generation 26: Best Fitness = 0.94
Generation 27: Best Fitness = 0.94
Generation 28: Best Fitness

Evaluate the best feature set

In [11]:
best_idx = np.argmax(fitness_scores)
best_individual = population[best_idx, :]
selected_features = [feature for feature, include in zip(X.columns, best_individual) if include == 1]

print(f"Selected Features: {selected_features}")

# Evaluate performance using the best feature subset
X_selected = X[selected_features]
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy with selected features: {accuracy_score(y_test, y_pred)}")


Selected Features: ['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_11', 'Feature_13', 'Feature_15', 'Feature_17', 'Feature_19', 'Feature_20']
Accuracy with selected features: 0.91
