In [None]:
import numpy as np
import random
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# -----------------------------------------------
# Load dataset
# -----------------------------------------------
df = pd.read_csv("SCOA_A4.csv")

# Convention used by this script: the right-most column is the class label
# and every column before it is a feature.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

print("‚úÖ Dataset loaded successfully")
# Short summary so the user can confirm the right file/target was picked up.
for _label, _value in (
    ("Shape:", df.shape),
    ("Columns:", df.columns.tolist()),
    ("Target variable:", df.columns[-1]),
):
    print(_label, _value)

# -----------------------------------------------
# User inputs
# -----------------------------------------------
# GA settings are read interactively; int()/float() raise ValueError on
# non-numeric text, and no range checking is performed on the values.
# Number of chromosomes kept in every generation.
POP_SIZE = int(input("Enter population size (e.g. 20): "))
# Number of select/crossover/mutate rounds to run.
N_GENERATIONS = int(input("Enter number of generations (e.g. 10): "))
# Per-gene probability used by mutate() to resample a gene.
MUTATION_RATE = float(input("Enter mutation rate (0.0‚Äì1.0, e.g. 0.2): "))
# Fraction of the population kept as parents by selection() (at least 2 are kept).
SELECTION_RATE = float(input("Enter selection rate (0.1‚Äì0.5, e.g. 0.2): "))

# -----------------------------------------------
# GA Helper Functions
# -----------------------------------------------
def create_chromosome():
    """Draw one random candidate solution.

    Returns:
        A two-gene list ``[max_depth, min_samples_split]`` where
        ``max_depth`` is sampled from 1..20 and ``min_samples_split`` from
        2..10 (both ranges inclusive).
    """
    depth_gene = random.randint(1, 20)
    split_gene = random.randint(2, 10)
    return [depth_gene, split_gene]

def fitness(chromosome):
    """Score a chromosome by cross-validating the tree it describes.

    Args:
        chromosome: two-item sequence ``(max_depth, min_samples_split)``.

    Returns:
        The mean of the 5-fold ``cross_val_score`` results for a
        ``DecisionTreeClassifier`` built with those hyperparameters and
        evaluated on the module-level ``X`` and ``y``.
    """
    depth, min_split = chromosome
    tree = DecisionTreeClassifier(
        random_state=42,
        max_depth=depth,
        min_samples_split=min_split,
    )
    return cross_val_score(tree, X, y, cv=5).mean()

def selection(population, fitnesses, rate):
    """Keep the fittest slice of the population as parents.

    Args:
        population: list of chromosomes.
        fitnesses: scores aligned index-for-index with ``population``.
        rate: fraction of the population to keep.

    Returns:
        The ``max(2, int(len(population) * rate))`` best chromosomes, in
        ascending fitness order (best one last).
    """
    keep = max(2, int(len(population) * rate))
    ranked = np.argsort(fitnesses)  # indices from worst to best
    return [population[i] for i in ranked[-keep:]]

def crossover(parent1, parent2):
    """Single-point crossover of two equal-length chromosomes.

    A cut position is drawn uniformly from ``1 .. len(parent1) - 1`` and the
    tails after that position are swapped. The parents are not modified.

    Returns:
        ``(child1, child2)`` where ``child1`` starts with ``parent1``'s head
        and ``child2`` starts with ``parent2``'s head.
    """
    cut = random.randint(1, len(parent1) - 1)
    head1, tail1 = parent1[:cut], parent1[cut:]
    head2, tail2 = parent2[:cut], parent2[cut:]
    return head1 + tail2, head2 + tail1

def mutate(chromosome, rate=None):
    """Randomly resample the genes of a chromosome, in place.

    Each gene is replaced independently with probability ``rate``:
    gene 0 (max_depth) by a fresh draw from 1..20 and gene 1
    (min_samples_split) by a fresh draw from 2..10, both inclusive.

    Args:
        chromosome: mutable two-item list ``[max_depth, min_samples_split]``.
        rate: per-gene mutation probability. When omitted, the module-level
            ``MUTATION_RATE`` is used, which is the original behaviour.

    Returns:
        The same list object that was passed in (possibly modified).
    """
    if rate is None:
        # Resolved at call time so existing one-argument callers keep
        # following the user-entered global setting.
        rate = MUTATION_RATE
    if random.random() < rate:
        chromosome[0] = random.randint(1, 20)
    if random.random() < rate:
        chromosome[1] = random.randint(2, 10)
    return chromosome

# -----------------------------------------------
# Run GA
# -----------------------------------------------
# fitness() builds its tree with a fixed random_state and an integer cv, so
# the same gene pair always gets the same score; cache it instead of
# re-running the 5-fold cross-validation for every duplicate chromosome.
_fitness_cache = {}


def _cached_fitness(chromo):
    """Return fitness(chromo), reusing the score of an identical chromosome."""
    key = tuple(chromo)
    if key not in _fitness_cache:
        _fitness_cache[key] = fitness(chromo)
    return _fitness_cache[key]


# Breeding pairs two distinct parents; fail early with a clear message
# instead of an obscure ValueError from random.sample()/max() later on.
if N_GENERATIONS > 0 and POP_SIZE < 2:
    raise ValueError("POP_SIZE must be at least 2 to run the genetic algorithm")

population = [create_chromosome() for _ in range(POP_SIZE)]

# Best chromosome seen in ANY generation. Each generation is replaced
# wholesale by its children, so without this record a good solution found
# early could be lost before the final report.
best_hyperparams = None
best_score = float("-inf")

for gen in range(N_GENERATIONS):
    fitnesses = [_cached_fitness(chromo) for chromo in population]
    best_idx = int(np.argmax(fitnesses))
    best_fitness = fitnesses[best_idx]
    print(f"Generation {gen+1}/{N_GENERATIONS} - Best Fitness: {best_fitness:.4f}")

    if best_fitness > best_score:
        best_score = best_fitness
        # Copy so the record cannot alias a list that is modified later.
        best_hyperparams = list(population[best_idx])

    parents = selection(population, fitnesses, SELECTION_RATE)

    new_population = []
    while len(new_population) < POP_SIZE:
        p1, p2 = random.sample(parents, 2)
        child1, child2 = crossover(p1, p2)
        new_population.append(mutate(child1))
        # The second child is dropped when it would overshoot POP_SIZE.
        if len(new_population) < POP_SIZE:
            new_population.append(mutate(child2))
    population = new_population

# -----------------------------------------------
# Final GA Result
# -----------------------------------------------
fitnesses = [_cached_fitness(chromo) for chromo in population]
best_idx = np.argmax(fitnesses)
# Ties go to the final population, so the report matches the original
# behaviour whenever the last generation is at least as good as any earlier one.
if best_hyperparams is None or fitnesses[best_idx] >= best_score:
    best_score = fitnesses[best_idx]
    best_hyperparams = population[best_idx]
print("\nüéØ Best Hyperparameters from GA:", best_hyperparams)
print("üìà Best Cross-Validation Accuracy:", best_score)

# -----------------------------------------------
# Compare with GridSearchCV
# -----------------------------------------------
# Exhaustive baseline over the same ranges the GA samples from:
# max_depth 1..20 and min_samples_split 2..10.
param_grid = dict(
    max_depth=range(1, 21),
    min_samples_split=range(2, 11),
)

grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)
grid.fit(X, y)

print("\nüß© Best Hyperparameters from GridSearchCV:", grid.best_params_)
print("üìä Best GridSearchCV Accuracy:", grid.best_score_)

# -----------------------------------------------
# Evaluate performance improvement
# -----------------------------------------------
# Hold out 20% of the rows for a before/after comparison.
# NOTE(review): fitness() cross-validates on all of X/y, which includes these
# test rows, so the GA figure below may be optimistic — consider tuning on
# X_train only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def _holdout_accuracy(model):
    """Fit *model* on the training split and return its test-split accuracy."""
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))


# Baseline: tree with scikit-learn's default hyperparameters.
model_default = DecisionTreeClassifier(random_state=42)
acc_default = _holdout_accuracy(model_default)

# Tree configured with the hyperparameters chosen by the GA.
model_ga = DecisionTreeClassifier(
    random_state=42,
    max_depth=best_hyperparams[0],
    min_samples_split=best_hyperparams[1],
)
acc_ga = _holdout_accuracy(model_ga)

print("\n‚öôÔ∏è Model Performance Comparison:")
print(f"Default Decision Tree Accuracy: {acc_default:.4f}")
print(f"GA Optimized Decision Tree Accuracy: {acc_ga:.4f}")
print(f"Improvement: {acc_ga - acc_default:.4f}")


✅ Dataset loaded successfully
Shape: (150, 5)
Columns: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
Target variable: species
Enter population size (e.g. 20): 20
Enter number of generations (e.g. 10): 10
Enter mutation rate (0.0–1.0, e.g. 0.2): 0.2
Enter selection rate (0.1–0.5, e.g. 0.2): 0.2
Generation 1/10 - Best Fitness: 0.9733
Generation 2/10 - Best Fitness: 0.9733
Generation 3/10 - Best Fitness: 0.9733
Generation 4/10 - Best Fitness: 0.9733
Generation 5/10 - Best Fitness: 0.9733
Generation 6/10 - Best Fitness: 0.9733
Generation 7/10 - Best Fitness: 0.9733
Generation 8/10 - Best Fitness: 0.9733
Generation 9/10 - Best Fitness: 0.9733
Generation 10/10 - Best Fitness: 0.9733

🎯 Best Hyperparameters from GA: [3, 2]
📈 Best Cross-Validation Accuracy: 0.9733333333333334

🧩 Best Hyperparameters from GridSearchCV: {'max_depth': 3, 'min_samples_split': 2}
📊 Best GridSearchCV Accuracy: 0.9733333333333334

⚙️ Model Performance Comparison:
Defa