In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import random

# Seed both random sources used in this cell so that Restart & Run All gives
# the same result every time: `random` drives crossover/mutation, NumPy's
# global RNG drives population initialisation and parent sampling.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Load the CSV file
df = pd.read_csv('autoencoder_dataset.csv')

# Separate features and target
X = df.drop(columns=['Label'])
y = df['Label']

# Split the dataset into training and testing sets (20% held out)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Genetic Algorithm Parameters
POP_SIZE = 10        # Population size
GENS = 50            # Number of generations
MUTATION_RATE = 0.1  # Per-gene probability of a bit flip
CROSSOVER_RATE = 0.7 # Crossover rate (NOTE(review): defined but not read by the code in this cell)

# Initialize the population with random chromosomes
def init_population(pop_size, n_features):
    """Create `pop_size` random binary chromosomes of length `n_features`.

    Each gene is drawn from {0, 1} with `np.random.choice`; 1 marks a feature
    as selected, 0 as not selected. The chromosomes are returned stacked in a
    single NumPy array, one per row.
    """
    return np.array([
        np.random.choice([0, 1], size=n_features)
        for _ in range(pop_size)
    ])

# Fitness function (SVM classifier)
def fitness_function(chromosome, X_train, y_train, X_test, y_test):
    """Score a feature mask by the accuracy of a linear SVM restricted to it.

    Positions where `chromosome` equals 1 are used as positional column
    indices into `X_train` / `X_test` (via `.iloc`). Returns 0 when no
    position is selected; otherwise fits `SVC(kernel='linear')` on the
    selected training columns and returns `accuracy_score` of its predictions
    on the same columns of `X_test`.
    """
    active_columns = [position for position, gene in enumerate(chromosome) if gene == 1]

    # An empty mask leaves nothing to train on.
    if len(active_columns) == 0:
        return 0

    classifier = SVC(kernel='linear')
    classifier.fit(X_train.iloc[:, active_columns], y_train)

    predictions = classifier.predict(X_test.iloc[:, active_columns])
    return accuracy_score(y_test, predictions)

# Selection (fitness-proportionate, i.e. roulette-wheel, selection)
def select_parents(population, X_train, y_train, X_test, y_test):
    """Sample two parents with probability proportional to their fitness.

    Every chromosome in `population` is scored with `fitness_function` on each
    call. The two indices are drawn with replacement (the `np.random.choice`
    default), so both returned parents may be the same individual.

    Returns a tuple `(parent1, parent2)` of chromosomes taken from `population`.
    """
    fitness_scores = np.array(
        [fitness_function(chromosome, X_train, y_train, X_test, y_test) for chromosome in population],
        dtype=float,
    )

    total_fitness = fitness_scores.sum()
    if total_fitness > 0:
        # Normalise so the scores form a valid probability distribution.
        probabilities = fitness_scores / total_fitness
    else:
        # All-zero fitness would turn the division into NaN and make
        # np.random.choice raise; p=None samples uniformly instead.
        probabilities = None

    # Select parents based on their fitness scores
    parent_indices = np.random.choice(len(population), size=2, p=probabilities)

    # Return the selected parents by their indices
    return population[parent_indices[0]], population[parent_indices[1]]

# Crossover (Single-point crossover)
def crossover(parent1, parent2):
    """Recombine two parents at one random cut point.

    The cut point is drawn uniformly from 1 .. len(parent1) - 1, so each child
    takes at least one gene from each parent. `child1` is the head of
    `parent1` followed by the tail of `parent2`; `child2` is the mirror image.

    Chromosomes with fewer than two genes have no valid cut point
    (`random.randint(1, 0)` raises ValueError), so copies of the parents are
    returned unchanged in that case.

    Returns a tuple `(child1, child2)` of new NumPy arrays.
    """
    if len(parent1) < 2:
        return np.array(parent1), np.array(parent2)

    crossover_point = random.randint(1, len(parent1) - 1)
    child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
    child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
    return child1, child2

# Mutation (Bit-flip mutation)
def mutate(chromosome, mutation_rate):
    """Return a copy of `chromosome` with each gene flipped independently.

    One `random.random()` draw is made per gene, in order; a gene is flipped
    (replaced by `1 - gene`) when its draw is below `mutation_rate`. The input
    chromosome is not modified.
    """
    flip_positions = [
        position
        for position in range(len(chromosome))
        if random.random() < mutation_rate
    ]

    result = chromosome.copy()
    for position in flip_positions:
        result[position] = 1 - result[position]  # Flip the bit
    return result

# Main Genetic Algorithm loop
def genetic_algorithm(X_train, y_train, X_test, y_test):
    """Evolve binary feature masks and return the best one seen in any generation.

    Each of the GENS generations draws POP_SIZE // 2 parent pairs with
    `select_parents`, recombines each pair with `crossover`, mutates every
    child with `mutate` (rate MUTATION_RATE) and replaces the whole population
    with the result. Nothing is carried over unchanged, so a generation's best
    score can be lower than an earlier one; the best chromosome is therefore
    tracked across generations instead of being taken from the last one only.

    Prints the best fitness of every generation and returns the chromosome
    with the highest fitness observed (the earliest one wins ties).

    NOTE(review): fitness is measured on (X_test, y_test), so the returned
    mask is tuned to that split; consider a separate validation split.
    """
    n_features = X_train.shape[1]
    population = init_population(POP_SIZE, n_features)

    best_chromosome = None
    best_fitness = None

    for gen in range(GENS):
        new_population = []

        # Selection and Crossover
        for _ in range(POP_SIZE // 2):
            parent1, parent2 = select_parents(population, X_train, y_train, X_test, y_test)
            child1, child2 = crossover(parent1, parent2)
            new_population.extend([child1, child2])

        # Mutation
        population = [mutate(chromosome, MUTATION_RATE) for chromosome in new_population]

        # Evaluate the new population and remember the best chromosome so far
        fitness_scores = [fitness_function(chromosome, X_train, y_train, X_test, y_test) for chromosome in population]
        generation_best_idx = int(np.argmax(fitness_scores))
        generation_best_fitness = fitness_scores[generation_best_idx]

        if best_chromosome is None or generation_best_fitness > best_fitness:
            best_chromosome = population[generation_best_idx]
            best_fitness = generation_best_fitness

        # Output the best fitness for the current generation
        print(f"Generation {gen+1}: Best Fitness = {generation_best_fitness}")

    if best_chromosome is None:
        # GENS == 0: no offspring were produced, so fall back to the best
        # member of the initial population instead of failing on an unset name.
        initial_scores = [fitness_function(chromosome, X_train, y_train, X_test, y_test) for chromosome in population]
        best_chromosome = population[int(np.argmax(initial_scores))]

    return best_chromosome

# Run the genetic algorithm on the train/test split prepared above
best_chromosome = genetic_algorithm(X_train, y_train, X_test, y_test)

# Translate the returned 0/1 mask into the positions of the selected features
selected_features = [position for position, gene in enumerate(best_chromosome) if gene == 1]
print("Best features selected:", selected_features)


Generation 1: Best Fitness = 0.8
Generation 2: Best Fitness = 0.8
Generation 3: Best Fitness = 0.8
Generation 4: Best Fitness = 0.8
Generation 5: Best Fitness = 0.6
Generation 6: Best Fitness = 0.6
Generation 7: Best Fitness = 0.8
Generation 8: Best Fitness = 0.8
Generation 9: Best Fitness = 0.8
Generation 10: Best Fitness = 1.0
Generation 11: Best Fitness = 0.8
Generation 12: Best Fitness = 0.8
Generation 13: Best Fitness = 1.0
Generation 14: Best Fitness = 0.8
Generation 15: Best Fitness = 0.8
Generation 16: Best Fitness = 0.6
Generation 17: Best Fitness = 0.6
Generation 18: Best Fitness = 0.6
Generation 19: Best Fitness = 0.8
Generation 20: Best Fitness = 1.0
Generation 21: Best Fitness = 1.0
Generation 22: Best Fitness = 1.0
Generation 23: Best Fitness = 0.8
Generation 24: Best Fitness = 0.8
Generation 25: Best Fitness = 0.8
Generation 26: Best Fitness = 0.8
Generation 27: Best Fitness = 0.8
Generation 28: Best Fitness = 0.8
Generation 29: Best Fitness = 0.8
Generation 30: Best Fit

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import random

# Seed both random sources used in this cell so that Restart & Run All gives
# the same result every time: `random` drives crossover/mutation, NumPy's
# global RNG drives population initialisation and parent sampling.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Load the CSV file
df = pd.read_csv('autoencoder_dataset.csv')

# Separate features and target
X = df.drop(columns=['Label'])
y = df['Label']

# Split the dataset into training and testing sets (20% held out)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Genetic Algorithm Parameters
POP_SIZE = 10        # Population size
GENS = 50            # Number of generations
MUTATION_RATE = 0.1  # Per-gene probability of a bit flip
CROSSOVER_RATE = 0.7 # Crossover rate (NOTE(review): defined but not read by the code in this cell)

# Initialize the population with random chromosomes
def init_population(pop_size, n_features):
    """Return an array of `pop_size` random 0/1 chromosomes, one per row.

    A gene value of 1 means the feature at that position is selected and 0
    means it is not. Each chromosome comes from its own `np.random.choice`
    call of size `n_features`.
    """
    chromosomes = [np.random.choice([0, 1], size=n_features) for _ in range(pop_size)]
    return np.array(chromosomes)

# Fitness function (SVM classifier)
def fitness_function(chromosome, X_train, y_train, X_test, y_test):
    """Return the test accuracy of a linear SVM trained on the masked features.

    Genes equal to 1 pick the column positions (used with `.iloc`) that are
    kept from `X_train` and `X_test`. A mask with no selected position scores
    0 without training anything.
    """
    chosen = []
    for position, gene in enumerate(chromosome):
        if gene == 1:
            chosen.append(position)

    if len(chosen) == 0:
        return 0  # nothing selected, nothing to fit

    train_subset = X_train.iloc[:, chosen]
    test_subset = X_test.iloc[:, chosen]

    svm = SVC(kernel='linear')
    svm.fit(train_subset, y_train)

    return accuracy_score(y_test, svm.predict(test_subset))

# Selection (fitness-proportionate, i.e. roulette-wheel, selection)
def select_parents(population, X_train, y_train, X_test, y_test):
    """Sample two parents with probability proportional to their fitness.

    Every chromosome in `population` is scored with `fitness_function` on each
    call. The two indices are drawn with replacement (the `np.random.choice`
    default), so both returned parents may be the same individual.

    Returns a tuple `(parent1, parent2)` of chromosomes taken from `population`.
    """
    fitness_scores = np.array(
        [fitness_function(chromosome, X_train, y_train, X_test, y_test) for chromosome in population],
        dtype=float,
    )

    total_fitness = fitness_scores.sum()
    if total_fitness > 0:
        # Normalise so the scores form a valid probability distribution.
        probabilities = fitness_scores / total_fitness
    else:
        # All-zero fitness would turn the division into NaN and make
        # np.random.choice raise; p=None samples uniformly instead.
        probabilities = None

    # Select parents based on their fitness scores
    parent_indices = np.random.choice(len(population), size=2, p=probabilities)

    # Return the selected parents by their indices
    return population[parent_indices[0]], population[parent_indices[1]]

# Crossover (Single-point crossover)
def crossover(parent1, parent2):
    """Recombine two parents at one random cut point.

    The cut point is drawn uniformly from 1 .. len(parent1) - 1, so each child
    takes at least one gene from each parent. `child1` is the head of
    `parent1` followed by the tail of `parent2`; `child2` is the mirror image.

    Chromosomes with fewer than two genes have no valid cut point
    (`random.randint(1, 0)` raises ValueError), so copies of the parents are
    returned unchanged in that case.

    Returns a tuple `(child1, child2)` of new NumPy arrays.
    """
    if len(parent1) < 2:
        return np.array(parent1), np.array(parent2)

    crossover_point = random.randint(1, len(parent1) - 1)
    child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
    child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
    return child1, child2

# Mutation (Bit-flip mutation)
def mutate(chromosome, mutation_rate):
    """Flip each gene of a copy of `chromosome` with probability `mutation_rate`.

    Genes are visited in order and one `random.random()` value is drawn for
    each; the gene becomes `1 - gene` when that value is below the rate. The
    caller's chromosome is left untouched.
    """
    offspring = chromosome.copy()
    for gene_index, _ in enumerate(chromosome):
        keep_gene = not (random.random() < mutation_rate)
        if keep_gene:
            continue
        offspring[gene_index] = 1 - offspring[gene_index]  # Flip the bit
    return offspring

# Main Genetic Algorithm loop
def genetic_algorithm(X_train, y_train, X_test, y_test, output_file="output.csv"):
    """Run the GA and log the best chromosome of every generation to a CSV.

    For each of the GENS generations, POP_SIZE // 2 parent pairs are drawn
    with `select_parents` and recombined with `crossover`; the children are
    mutated with `mutate` (rate MUTATION_RATE) and replace the population.
    The best fitness of the new population is printed and recorded together
    with the positions of its selected features.

    Returns a DataFrame with one row per generation and the columns
    "Generation", "Best Fitness" and "Selected Features"; the same frame is
    written to `output_file` without its index.
    """
    population = init_population(POP_SIZE, X_train.shape[1])

    # One record per generation: its number, best score and best feature set
    history = []

    for generation in range(1, GENS + 1):
        # Breed POP_SIZE // 2 pairs of children from the current population
        offspring = []
        for _ in range(POP_SIZE // 2):
            first_parent, second_parent = select_parents(population, X_train, y_train, X_test, y_test)
            offspring.extend(crossover(first_parent, second_parent))

        # The mutated children fully replace the previous population
        population = [mutate(child, MUTATION_RATE) for child in offspring]

        # Score the new population and pick its top individual
        scores = [fitness_function(individual, X_train, y_train, X_test, y_test) for individual in population]
        top_index = np.argmax(scores)
        top_individual = population[top_index]
        best_fitness = scores[top_index]
        best_features = [position for position, gene in enumerate(top_individual) if gene == 1]

        history.append({"Generation": generation, "Best Fitness": best_fitness, "Selected Features": best_features})

        print(f"Generation {generation}: Best Fitness = {best_fitness}")

    # Persist the per-generation log and hand it back to the caller
    generations_df = pd.DataFrame(history)
    generations_df.to_csv(output_file, index=False)
    return generations_df

# Run the genetic algorithm and save the output
output_file = "genetic_algorithm_output.csv"
generations_df = genetic_algorithm(X_train, y_train, X_test, y_test, output_file)

# Report the feature set of the best-scoring generation rather than simply
# the last one: each generation replaces the whole population, so the last
# row of the log is not guaranteed to hold the highest fitness.
# idxmax returns the first row that reaches the maximum.
best_generation_row = generations_df.loc[generations_df["Best Fitness"].idxmax()]
final_selected_features = best_generation_row["Selected Features"]
print(f"Final selected features: {final_selected_features}")


Generation 1: Best Fitness = 0.8
Generation 2: Best Fitness = 0.8
Generation 3: Best Fitness = 0.6
Generation 4: Best Fitness = 0.6
Generation 5: Best Fitness = 0.6
Generation 6: Best Fitness = 0.6
Generation 7: Best Fitness = 0.8
Generation 8: Best Fitness = 0.8
Generation 9: Best Fitness = 0.6
Generation 10: Best Fitness = 0.6
Generation 11: Best Fitness = 0.8
Generation 12: Best Fitness = 0.6
Generation 13: Best Fitness = 0.8
Generation 14: Best Fitness = 0.8
Generation 15: Best Fitness = 0.8
Generation 16: Best Fitness = 0.6
Generation 17: Best Fitness = 0.6
Generation 18: Best Fitness = 0.8
Generation 19: Best Fitness = 0.8
Generation 20: Best Fitness = 1.0
Generation 21: Best Fitness = 0.8
Generation 22: Best Fitness = 0.6
Generation 23: Best Fitness = 1.0
Generation 24: Best Fitness = 0.8
Generation 25: Best Fitness = 0.6
Generation 26: Best Fitness = 0.6
Generation 27: Best Fitness = 0.8
Generation 28: Best Fitness = 0.8
Generation 29: Best Fitness = 0.8
Generation 30: Best Fit

In [None]:
import pandas as pd
# Read back the per-generation log from 'genetic_algorithm_output.csv' and
# display its first five rows as the cell output.
df = pd.read_csv('genetic_algorithm_output.csv')
df.head(5)

Unnamed: 0,Generation,Best Fitness,Selected Features
0,1,0.8,"[2, 3, 4, 8, 10, 11, 15, 17, 18, 19, 20, 22, 2..."
1,2,0.8,"[1, 3, 4, 6, 10, 12, 15, 17, 18, 19, 21, 24, 2..."
2,3,0.6,"[0, 1, 2, 3, 8, 9, 10, 14, 16, 20, 21, 23, 27,..."
3,4,0.6,"[3, 6, 7, 10, 14, 15, 16, 17, 18, 19, 24, 25, ..."
4,5,0.6,"[1, 2, 3, 4, 6, 7, 8, 9, 10, 14, 15, 16, 19, 2..."


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import random

# Seed both random sources used in this cell so that Restart & Run All gives
# the same result every time: `random` drives crossover/mutation, NumPy's
# global RNG drives population initialisation and parent sampling.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Load the CSV file
df = pd.read_csv('autoencoder_dataset.csv')

# Separate features and target
X = df.drop(columns=['Label'])
y = df['Label']

# Split the dataset into training and testing sets (20% held out)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Genetic Algorithm Parameters
POP_SIZE = 10        # Population size
GENS = 50            # Number of generations
MUTATION_RATE = 0.1  # Per-gene probability of a bit flip
CROSSOVER_RATE = 0.7 # Crossover rate (NOTE(review): defined but not read by the code in this cell)

# Initialize the population with random chromosomes
def init_population(pop_size, n_features):
    """Generate the starting population as an array of random 0/1 rows.

    `pop_size` chromosomes are produced, each by a separate
    `np.random.choice([0, 1], size=n_features)` call, and stacked with
    `np.array`.
    """
    return np.array([
        np.random.choice([0, 1], size=n_features)
        for _ in range(pop_size)
    ])

# Fitness function (SVM classifier)
def fitness_function(chromosome, X_train, y_train, X_test, y_test):
    """Score a feature mask by the accuracy of a linear SVM restricted to it.

    Positions where `chromosome` equals 1 are used as positional column
    indices into `X_train` / `X_test` (via `.iloc`). Returns 0 when no
    position is selected; otherwise fits `SVC(kernel='linear')` on the
    selected training columns and returns `accuracy_score` of its predictions
    on the same columns of `X_test`.
    """
    active_columns = [position for position, gene in enumerate(chromosome) if gene == 1]

    # An empty mask leaves nothing to train on.
    if len(active_columns) == 0:
        return 0

    classifier = SVC(kernel='linear')
    classifier.fit(X_train.iloc[:, active_columns], y_train)

    predictions = classifier.predict(X_test.iloc[:, active_columns])
    return accuracy_score(y_test, predictions)

# Selection (fitness-proportionate, i.e. roulette-wheel, selection)
def select_parents(population, X_train, y_train, X_test, y_test):
    """Sample two parents with probability proportional to their fitness.

    Every chromosome in `population` is scored with `fitness_function` on each
    call. The two indices are drawn with replacement (the `np.random.choice`
    default), so both returned parents may be the same individual.

    Returns a tuple `(parent1, parent2)` of chromosomes taken from `population`.
    """
    fitness_scores = np.array(
        [fitness_function(chromosome, X_train, y_train, X_test, y_test) for chromosome in population],
        dtype=float,
    )

    total_fitness = fitness_scores.sum()
    if total_fitness > 0:
        # Normalise so the scores form a valid probability distribution.
        probabilities = fitness_scores / total_fitness
    else:
        # All-zero fitness would turn the division into NaN and make
        # np.random.choice raise; p=None samples uniformly instead.
        probabilities = None

    parent_indices = np.random.choice(len(population), size=2, p=probabilities)
    return population[parent_indices[0]], population[parent_indices[1]]

# Crossover (Single-point crossover)
def crossover(parent1, parent2):
    """Recombine two parents at one random cut point.

    The cut point is drawn uniformly from 1 .. len(parent1) - 1, so each child
    takes at least one gene from each parent. `child1` is the head of
    `parent1` followed by the tail of `parent2`; `child2` is the mirror image.

    Chromosomes with fewer than two genes have no valid cut point
    (`random.randint(1, 0)` raises ValueError), so copies of the parents are
    returned unchanged in that case.

    Returns a tuple `(child1, child2)` of new NumPy arrays.
    """
    if len(parent1) < 2:
        return np.array(parent1), np.array(parent2)

    crossover_point = random.randint(1, len(parent1) - 1)
    child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
    child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
    return child1, child2

# Mutation (Bit-flip mutation)
def mutate(chromosome, mutation_rate):
    """Return a copy of `chromosome` with each gene flipped independently.

    One `random.random()` draw is made per gene, in order; a gene is flipped
    (replaced by `1 - gene`) when its draw is below `mutation_rate`. The input
    chromosome is not modified.
    """
    flip_positions = [
        position
        for position in range(len(chromosome))
        if random.random() < mutation_rate
    ]

    result = chromosome.copy()
    for position in flip_positions:
        result[position] = 1 - result[position]
    return result

# Main Genetic Algorithm loop with CSV output
def genetic_algorithm(X_train, y_train, X_test, y_test, output_file='genetic_algorithm_results.csv'):
    """Run the GA, log every generation to a CSV and return the best chromosome seen.

    Each of the GENS generations draws POP_SIZE // 2 parent pairs with
    `select_parents`, recombines them with `crossover`, mutates the children
    with `mutate` (rate MUTATION_RATE) and replaces the population with them.
    Nothing is carried over unchanged, so the last generation is not
    necessarily the best; the best chromosome is tracked across generations
    (the earliest one wins ties).

    Parameters
    ----------
    X_train, y_train, X_test, y_test
        Forwarded unchanged to `select_parents` and `fitness_function`.
    output_file
        Path the per-generation log is written to (without the index).

    Returns
    -------
    (best_chromosome, results_df)
        The highest-scoring chromosome observed and a DataFrame with the
        columns 'Generation', 'Best_Fitness', 'Selected_Features' (the index
        list rendered as a string) and 'Num_Selected_Features'.
    """
    n_features = X_train.shape[1]
    population = init_population(POP_SIZE, n_features)

    # Per-generation log rows
    results = []

    best_chromosome = None
    best_overall_fitness = None

    for gen in range(GENS):
        new_population = []

        # Selection and Crossover
        for _ in range(POP_SIZE // 2):
            parent1, parent2 = select_parents(population, X_train, y_train, X_test, y_test)
            child1, child2 = crossover(parent1, parent2)
            new_population.extend([child1, child2])

        # Mutation
        population = [mutate(chromosome, MUTATION_RATE) for chromosome in new_population]

        # Evaluate the new population and find its best chromosome
        fitness_scores = [fitness_function(chromosome, X_train, y_train, X_test, y_test) for chromosome in population]
        best_idx = int(np.argmax(fitness_scores))
        generation_best = population[best_idx]
        best_fitness = fitness_scores[best_idx]

        # Keep the best chromosome across generations, not just the latest one
        if best_chromosome is None or best_fitness > best_overall_fitness:
            best_chromosome = generation_best
            best_overall_fitness = best_fitness

        # Get selected features for this generation
        selected_features = [i for i, gene in enumerate(generation_best) if gene == 1]

        # Store results for this generation
        results.append({
            'Generation': gen + 1,
            'Best_Fitness': best_fitness,
            'Selected_Features': str(selected_features),
            'Num_Selected_Features': len(selected_features)
        })

        print(f"Generation {gen+1}: Best Fitness = {best_fitness}")

    if best_chromosome is None:
        # GENS == 0: no offspring were produced, so fall back to the best
        # member of the initial population instead of failing on an unset name.
        initial_scores = [fitness_function(chromosome, X_train, y_train, X_test, y_test) for chromosome in population]
        best_chromosome = population[int(np.argmax(initial_scores))]

    # Create DataFrame and save to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_file, index=False)

    return best_chromosome, results_df

# Run the genetic algorithm; it returns a chromosome and the per-generation log
best_chromosome, results_df = genetic_algorithm(X_train, y_train, X_test, y_test)

# Show the first rows of the per-generation log
print("\nFinal Results:")
print(results_df.head())
print("\nResults have been saved to 'genetic_algorithm_results.csv'")

# Positions of the genes set to 1 in the returned chromosome
final_selected_features = [position for position, gene in enumerate(best_chromosome) if gene == 1]

# One row per feature: its position, its 0/1 gene and its column name in X
feature_importance_df = pd.DataFrame(
    {
        'Feature_Index': range(len(best_chromosome)),
        'Is_Selected': best_chromosome,
        'Feature_Name': X.columns,
    }
)
feature_importance_df.to_csv('feature_importance.csv', index=False)
print("\nFeature importance summary has been saved to 'feature_importance.csv'")

Generation 1: Best Fitness = 0.8
Generation 2: Best Fitness = 0.6
Generation 3: Best Fitness = 0.6
Generation 4: Best Fitness = 0.6
Generation 5: Best Fitness = 0.8
Generation 6: Best Fitness = 0.6
Generation 7: Best Fitness = 0.6
Generation 8: Best Fitness = 0.6
Generation 9: Best Fitness = 0.6
Generation 10: Best Fitness = 0.8
Generation 11: Best Fitness = 0.8
Generation 12: Best Fitness = 0.6
Generation 13: Best Fitness = 0.6
Generation 14: Best Fitness = 0.8
Generation 15: Best Fitness = 0.6
Generation 16: Best Fitness = 0.6
Generation 17: Best Fitness = 0.8
Generation 18: Best Fitness = 0.6
Generation 19: Best Fitness = 0.6
Generation 20: Best Fitness = 0.8
Generation 21: Best Fitness = 0.6
Generation 22: Best Fitness = 0.8
Generation 23: Best Fitness = 0.6
Generation 24: Best Fitness = 0.8
Generation 25: Best Fitness = 0.8
Generation 26: Best Fitness = 0.8
Generation 27: Best Fitness = 0.8
Generation 28: Best Fitness = 0.8
Generation 29: Best Fitness = 0.8
Generation 30: Best Fit

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import random

# Seed both random sources used in this cell so that Restart & Run All gives
# the same result every time: `random` drives crossover/mutation, NumPy's
# global RNG drives population initialisation and parent sampling.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Load the CSV file
df = pd.read_csv('autoencoder_dataset.csv')

# Separate features and target
X = df.drop(columns=['Label'])
y = df['Label']

# Split the dataset into training and testing sets (20% held out)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Genetic Algorithm Parameters
POP_SIZE = 10        # Population size
GENS = 50            # Number of generations
MUTATION_RATE = 0.1  # Per-gene probability of a bit flip
CROSSOVER_RATE = 0.7 # Crossover rate (NOTE(review): defined but not read by the code in this cell)

def init_population(pop_size, n_features):
    """Return an array of `pop_size` random 0/1 chromosomes of length `n_features`.

    Each chromosome is drawn with its own `np.random.choice([0, 1], ...)` call.
    """
    chromosomes = [np.random.choice([0, 1], size=n_features) for _ in range(pop_size)]
    return np.array(chromosomes)

def fitness_function(chromosome, X_train, y_train, X_test, y_test):
    """Return the test accuracy of a linear SVM trained on the masked features.

    Genes equal to 1 pick the column positions (used with `.iloc`) that are
    kept from `X_train` and `X_test`. A mask with no selected position scores
    0 without training anything.
    """
    chosen = []
    for position, gene in enumerate(chromosome):
        if gene == 1:
            chosen.append(position)

    if len(chosen) == 0:
        return 0

    train_subset = X_train.iloc[:, chosen]
    test_subset = X_test.iloc[:, chosen]

    svm = SVC(kernel='linear')
    svm.fit(train_subset, y_train)

    return accuracy_score(y_test, svm.predict(test_subset))

def select_parents(population, X_train, y_train, X_test, y_test):
    """Sample two parents with probability proportional to their fitness.

    Every chromosome in `population` is scored with `fitness_function` on each
    call. The two indices are drawn with replacement (the `np.random.choice`
    default), so both returned parents may be the same individual.

    Returns a tuple `(parent1, parent2)` of chromosomes taken from `population`.
    """
    fitness_scores = np.array(
        [fitness_function(chromosome, X_train, y_train, X_test, y_test) for chromosome in population],
        dtype=float,
    )

    total_fitness = fitness_scores.sum()
    if total_fitness > 0:
        # Normalise so the scores form a valid probability distribution.
        probabilities = fitness_scores / total_fitness
    else:
        # All-zero fitness would turn the division into NaN and make
        # np.random.choice raise; p=None samples uniformly instead.
        probabilities = None

    parent_indices = np.random.choice(len(population), size=2, p=probabilities)
    return population[parent_indices[0]], population[parent_indices[1]]

def crossover(parent1, parent2):
    """Recombine two parents at one random cut point (single-point crossover).

    The cut point is drawn uniformly from 1 .. len(parent1) - 1, so each child
    takes at least one gene from each parent. `child1` is the head of
    `parent1` followed by the tail of `parent2`; `child2` is the mirror image.

    Chromosomes with fewer than two genes have no valid cut point
    (`random.randint(1, 0)` raises ValueError), so copies of the parents are
    returned unchanged in that case.

    Returns a tuple `(child1, child2)` of new NumPy arrays.
    """
    if len(parent1) < 2:
        return np.array(parent1), np.array(parent2)

    crossover_point = random.randint(1, len(parent1) - 1)
    child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
    child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
    return child1, child2

def mutate(chromosome, mutation_rate):
    """Flip each gene of a copy of `chromosome` with probability `mutation_rate`.

    Genes are visited in order and one `random.random()` value is drawn for
    each; the gene becomes `1 - gene` when that value is below the rate. The
    caller's chromosome is left untouched.
    """
    offspring = chromosome.copy()
    for gene_index, _ in enumerate(chromosome):
        keep_gene = not (random.random() < mutation_rate)
        if keep_gene:
            continue
        offspring[gene_index] = 1 - offspring[gene_index]
    return offspring

def genetic_algorithm(X_train, y_train, X_test, y_test):
    """Evolve binary feature masks and return the best one seen in any generation.

    Each of the GENS generations draws POP_SIZE // 2 parent pairs with
    `select_parents`, recombines them with `crossover`, mutates the children
    with `mutate` (rate MUTATION_RATE) and replaces the population with them.
    The best fitness of each generation is printed with four decimals.

    Always returns a chromosome: the first generation's best is accepted
    unconditionally, even at fitness 0, and is replaced only by a strictly
    better one. This keeps the caller from receiving None.
    """
    n_features = X_train.shape[1]
    population = init_population(POP_SIZE, n_features)

    best_overall_fitness = None
    best_overall_chromosome = None

    for gen in range(GENS):
        new_population = []

        for _ in range(POP_SIZE // 2):
            parent1, parent2 = select_parents(population, X_train, y_train, X_test, y_test)
            child1, child2 = crossover(parent1, parent2)
            new_population.extend([child1, child2])

        population = [mutate(chromosome, MUTATION_RATE) for chromosome in new_population]

        # Evaluate population
        fitness_scores = [fitness_function(chromosome, X_train, y_train, X_test, y_test) for chromosome in population]
        best_idx = int(np.argmax(fitness_scores))
        current_best_fitness = fitness_scores[best_idx]

        # `is None` guards the first generation so a zero score is still recorded
        if best_overall_chromosome is None or current_best_fitness > best_overall_fitness:
            best_overall_fitness = current_best_fitness
            best_overall_chromosome = population[best_idx]

        print(f"Generation {gen+1}: Best Fitness = {current_best_fitness:.4f}")

    if best_overall_chromosome is None:
        # GENS == 0: no offspring were produced, so fall back to the best
        # member of the initial population.
        initial_scores = [fitness_function(chromosome, X_train, y_train, X_test, y_test) for chromosome in population]
        best_overall_chromosome = population[int(np.argmax(initial_scores))]

    return best_overall_chromosome

# Run the genetic algorithm
best_chromosome = genetic_algorithm(X_train, y_train, X_test, y_test)

# Positions of the genes set to 1, and the matching column names in X
selected_features = [position for position, gene in enumerate(best_chromosome) if gene == 1]
selected_columns = X.columns[selected_features].tolist()

# Final dataset: the selected feature columns followed by the Label column
final_dataset = df[selected_columns + ['Label']]

# Report the selection
print("\nSelected Features:", selected_features)
print("\nNumber of selected features:", len(selected_features))
print("\nFirst few rows of the optimized dataset:")
print(final_dataset.head())

# Save the optimized dataset
final_dataset.to_csv('optimized_dataset.csv', index=False)

# Save feature selection details: one row per original feature
feature_details = pd.DataFrame(
    {
        'Feature': X.columns,
        'Selected': best_chromosome,
        'Feature_Index': range(len(best_chromosome)),
    }
)
feature_details.to_csv('feature_selection_details.csv', index=False)

# Create a summary of the genetic algorithm results
summary_metrics = ['Total Original Features', 'Selected Features', 'Features Removed', 'Selected Feature Indices']
summary_values = [
    len(X.columns),
    len(selected_features),
    len(X.columns) - len(selected_features),
    str(selected_features),
]
summary_df = pd.DataFrame({'Metric': summary_metrics, 'Value': summary_values})
summary_df.to_csv('optimization_summary.csv', index=False)

Generation 1: Best Fitness = 0.6000
Generation 2: Best Fitness = 0.8000
Generation 3: Best Fitness = 1.0000
Generation 4: Best Fitness = 0.8000
Generation 5: Best Fitness = 0.8000
Generation 6: Best Fitness = 0.8000
Generation 7: Best Fitness = 0.8000
Generation 8: Best Fitness = 0.8000
Generation 9: Best Fitness = 0.8000
Generation 10: Best Fitness = 0.8000
Generation 11: Best Fitness = 0.8000
Generation 12: Best Fitness = 0.6000
Generation 13: Best Fitness = 0.8000
Generation 14: Best Fitness = 0.8000
Generation 15: Best Fitness = 0.8000
Generation 16: Best Fitness = 0.8000
Generation 17: Best Fitness = 0.6000
Generation 18: Best Fitness = 0.6000
Generation 19: Best Fitness = 0.6000
Generation 20: Best Fitness = 0.8000
Generation 21: Best Fitness = 0.8000
Generation 22: Best Fitness = 0.8000
Generation 23: Best Fitness = 0.8000
Generation 24: Best Fitness = 0.6000
Generation 25: Best Fitness = 0.8000
Generation 26: Best Fitness = 0.8000
Generation 27: Best Fitness = 0.6000
Generation

In [None]:
import pandas as pd
# Read back 'optimized_dataset.csv' and display its first five rows as the
# cell output.
df = pd.read_csv('optimized_dataset.csv')
df.head(5)

Unnamed: 0,encoded_5,encoded_8,encoded_10,encoded_11,encoded_12,encoded_14,encoded_20,encoded_21,encoded_22,encoded_25,...,encoded_86,encoded_88,encoded_89,encoded_90,encoded_91,encoded_96,encoded_97,encoded_99,encoded_100,Label
0,0.0,0.0,0.0,0.0,0.0,971.72577,0.0,0.0,0.0,586.6567,...,0.0,0.0,0.0,0.0,0.0,0.0,1678.4856,929.9107,0.0,cancerous
1,0.0,0.0,0.0,0.0,0.0,0.0,1118.9606,0.0,0.0,0.0,...,693.40106,0.0,638.24316,748.67413,0.0,1265.4187,0.0,0.0,0.0,cancerous
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2278.2708,0.0,0.0,...,0.0,514.9104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cancerous
3,256.93192,460.3955,0.0,0.0,481.3307,0.0,394.6675,0.0,10.721924,0.0,...,0.0,20.788239,0.0,0.0,2.25569,0.0,0.0,0.0,0.0,cancerous
4,0.0,99.23599,148.24594,0.0,0.0,57.042873,146.80266,0.0,158.43515,0.0,...,0.0,0.0,0.0,207.35503,0.0,0.0,0.0,0.0,0.0,cancerous


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import random
from itertools import product

# Seed both random sources used in this cell so that Restart & Run All gives
# the same result every time: `random` drives crossover/mutation, NumPy's
# global RNG drives population initialisation and parent sampling.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Load the CSV file
df = pd.read_csv('autoencoder_dataset.csv')

# Separate features and target
X = df.drop(columns=['Label'])
y = df['Label']

# Split the dataset into training and testing sets (20% held out)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

def init_population(pop_size, n_features):
    """Build the starting population of random 0/1 chromosomes.

    Makes `pop_size` separate `np.random.choice([0, 1], size=n_features)`
    draws and returns them stacked in one NumPy array.
    """
    chromosomes = []
    for _ in range(pop_size):
        chromosomes.append(np.random.choice([0, 1], size=n_features))
    return np.array(chromosomes)

def fitness_function(chromosome, X_train, y_train, X_test, y_test):
    """Score a feature mask by the accuracy of a linear SVM restricted to it.

    Positions where `chromosome` equals 1 are used as positional column
    indices into `X_train` / `X_test` (via `.iloc`). Returns 0 when no
    position is selected; otherwise fits `SVC(kernel='linear')` on the
    selected training columns and returns `accuracy_score` of its predictions
    on the same columns of `X_test`.
    """
    active_columns = [position for position, gene in enumerate(chromosome) if gene == 1]

    # An empty mask leaves nothing to train on.
    if len(active_columns) == 0:
        return 0

    classifier = SVC(kernel='linear')
    classifier.fit(X_train.iloc[:, active_columns], y_train)

    predictions = classifier.predict(X_test.iloc[:, active_columns])
    return accuracy_score(y_test, predictions)

def select_parents(population, X_train, y_train, X_test, y_test):
    """Sample two parents with probability proportional to their fitness.

    Every chromosome in `population` is scored with `fitness_function` on each
    call. The two indices are drawn with replacement (the `np.random.choice`
    default), so both returned parents may be the same individual.

    Returns a tuple `(parent1, parent2)` of chromosomes taken from `population`.
    """
    fitness_scores = np.array(
        [fitness_function(chromosome, X_train, y_train, X_test, y_test) for chromosome in population],
        dtype=float,
    )

    total_fitness = fitness_scores.sum()
    if total_fitness > 0:
        # Normalise so the scores form a valid probability distribution.
        probabilities = fitness_scores / total_fitness
    else:
        # All-zero fitness would turn the division into NaN and make
        # np.random.choice raise; p=None samples uniformly instead.
        probabilities = None

    parent_indices = np.random.choice(len(population), size=2, p=probabilities)
    return population[parent_indices[0]], population[parent_indices[1]]

def crossover(parent1, parent2, crossover_rate):
    """Single-point crossover applied with probability `crossover_rate`.

    Chromosomes with fewer than two genes have no valid cut point
    (`random.randint(1, 0)` raises ValueError), so copies of the parents are
    returned for them without consuming any random numbers. Otherwise one
    `random.random()` draw decides whether to recombine: if it exceeds
    `crossover_rate`, copies of the parents are returned; else a cut point is
    drawn uniformly from 1 .. len(parent1) - 1 and the tails are swapped.

    Returns a tuple `(child1, child2)`.
    """
    if len(parent1) < 2:
        return np.array(parent1), np.array(parent2)

    if random.random() > crossover_rate:
        return parent1.copy(), parent2.copy()

    crossover_point = random.randint(1, len(parent1) - 1)
    child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
    child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
    return child1, child2

def mutate(chromosome, mutation_rate):
    """Return a copy of `chromosome` with each gene flipped independently.

    One `random.random()` draw is made per gene, in order; a gene is flipped
    (replaced by `1 - gene`) when its draw is below `mutation_rate`. The input
    chromosome is not modified.
    """
    flip_positions = [
        position
        for position in range(len(chromosome))
        if random.random() < mutation_rate
    ]

    result = chromosome.copy()
    for position in flip_positions:
        result[position] = 1 - result[position]
    return result

def genetic_algorithm(X_train, y_train, X_test, y_test, pop_size, gens, mutation_rate, crossover_rate):
    """Run one GA configuration and return its best fitness and chromosome.

    Each of the `gens` generations draws `pop_size // 2` parent pairs with
    `select_parents`, recombines them with `crossover` (using
    `crossover_rate`), mutates the children with `mutate` (using
    `mutation_rate`) and replaces the population with them.

    Returns `(best_overall_fitness, best_overall_chromosome)`. The chromosome
    is never None: the first generation's best is accepted unconditionally,
    even at fitness 0, and is replaced only by a strictly better one.
    """
    n_features = X_train.shape[1]
    population = init_population(pop_size, n_features)

    best_overall_fitness = 0
    best_overall_chromosome = None

    for gen in range(gens):
        new_population = []

        for _ in range(pop_size // 2):
            parent1, parent2 = select_parents(population, X_train, y_train, X_test, y_test)
            child1, child2 = crossover(parent1, parent2, crossover_rate)
            new_population.extend([child1, child2])

        population = [mutate(chromosome, mutation_rate) for chromosome in new_population]

        # Evaluate population
        fitness_scores = [fitness_function(chromosome, X_train, y_train, X_test, y_test) for chromosome in population]
        best_idx = int(np.argmax(fitness_scores))
        current_best_fitness = fitness_scores[best_idx]

        # `is None` guards the first generation so a zero score is still recorded
        if best_overall_chromosome is None or current_best_fitness > best_overall_fitness:
            best_overall_fitness = current_best_fitness
            best_overall_chromosome = population[best_idx]

    if best_overall_chromosome is None:
        # gens == 0: no offspring were produced, so fall back to the best
        # member of the initial population.
        initial_scores = [fitness_function(chromosome, X_train, y_train, X_test, y_test) for chromosome in population]
        initial_best_idx = int(np.argmax(initial_scores))
        best_overall_fitness = initial_scores[initial_best_idx]
        best_overall_chromosome = population[initial_best_idx]

    return best_overall_fitness, best_overall_chromosome

# Parameter tuning: run the GA once for every combination of the grids below
param_combinations = list(product(
    [10, 20],          # Population size
    [20, 50],          # Number of generations
    [0.05, 0.1, 0.2],  # Mutation rate
    [0.6, 0.8, 0.9]    # Crossover rate
))

best_global_fitness = 0
best_global_chromosome = None
best_params = None

for params in param_combinations:
    pop_size, gens, mutation_rate, crossover_rate = params
    fitness, chromosome = genetic_algorithm(X_train, y_train, X_test, y_test, pop_size, gens, mutation_rate, crossover_rate)

    # Accept the first usable chromosome unconditionally (even at fitness 0),
    # then require a strict improvement; ties keep the earlier combination.
    if chromosome is not None and (best_global_chromosome is None or fitness > best_global_fitness):
        best_global_fitness = fitness
        best_global_chromosome = chromosome
        best_params = params

# Fail with a clear message instead of an opaque TypeError from len(None) below
if best_global_chromosome is None:
    raise RuntimeError("No parameter combination produced a usable chromosome.")

# Get selected features from the best chromosome
selected_features = [i for i in range(len(best_global_chromosome)) if best_global_chromosome[i] == 1]

# Create new dataset with only selected features
selected_columns = X.columns[selected_features].tolist()

# Create the final dataset with selected features and Label
final_dataset = pd.concat([
    df[selected_columns],
    df['Label']
], axis=1)

# Report the winning configuration and its feature set
print("\nBest Parameters (Population Size, Generations, Mutation Rate, Crossover Rate):", best_params)
print("\nBest Fitness Score:", best_global_fitness)
print("\nSelected Features:", selected_features)
print("\nNumber of selected features:", len(selected_features))

# Save the optimized dataset
final_dataset.to_csv('optimized_dataset2.csv', index=False)

# Save feature selection details: one row per original feature
feature_details = pd.DataFrame({
    'Feature': X.columns,
    'Selected': best_global_chromosome,
    'Feature_Index': range(len(best_global_chromosome))
})
feature_details.to_csv('feature_selection_details2.csv', index=False)

# Create a summary of the genetic algorithm results
summary_df = pd.DataFrame({
    'Metric': ['Total Original Features', 'Selected Features', 'Features Removed', 'Selected Feature Indices'],
    'Value': [
        len(X.columns),
        len(selected_features),
        len(X.columns) - len(selected_features),
        str(selected_features)
    ]
})
summary_df.to_csv('optimization_summary2.csv', index=False)



Best Parameters (Population Size, Generations, Mutation Rate, Crossover Rate): (10, 20, 0.05, 0.6)

Best Fitness Score: 1.0

Selected Features: [2, 3, 4, 5, 8, 9, 12, 14, 16, 18, 19, 20, 21, 22, 23, 26, 29, 33, 35, 40, 41, 44, 46, 49, 52, 54, 55, 56, 57, 58, 60, 62, 63, 66, 68, 69, 70, 71, 77, 79, 80, 83, 87, 92, 93]

Number of selected features: 45


In [None]:
import pandas as pd
# Read back 'optimized_dataset2.csv' and display its first five rows as the
# cell output.
df = pd.read_csv('optimized_dataset2.csv')
df.head(5)

Unnamed: 0,encoded_3,encoded_4,encoded_5,encoded_6,encoded_9,encoded_10,encoded_13,encoded_15,encoded_17,encoded_19,...,encoded_71,encoded_72,encoded_78,encoded_80,encoded_81,encoded_84,encoded_88,encoded_93,encoded_94,Label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cancerous
1,0.0,1315.5032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1892.3402,0.0,0.0,0.0,0.0,0.0,cancerous
2,1502.5161,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1928.657,0.0,0.0,0.0,514.9104,0.0,0.0,cancerous
3,0.0,0.0,256.93192,0.0,146.5091,0.0,0.0,0.0,313.12375,0.0,...,298.37656,0.0,23.50032,0.0,114.356995,10.897052,20.788239,0.0,115.7502,cancerous
4,0.0,0.0,0.0,0.0,202.07448,148.24594,534.2414,0.0,0.0,63.63102,...,0.0,0.0,0.0,279.44452,145.55267,0.0,0.0,262.94797,0.0,cancerous
