# In [None]:
# feature_selection_ga.ipynb
import numpy as np
import random
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# ----- GA configuration -----
# Seed both the stdlib and NumPy generators so repeated runs match.
random.seed(42)
np.random.seed(42)
POP_SIZE = 30  # individuals per generation
GENERATIONS = 20  # number of generations to evolve
TOURNAMENT_K = 3  # contestants drawn for each tournament selection
CROSSOVER_RATE = 0.9  # probability that a parent pair is recombined
MUTATION_RATE = 0.02  # per-gene probability of a bit flip
ELITISM = True  # carry the best individual into the next generation

# ----- Data -----
# Breast-cancer dataset from scikit-learn: feature matrix and target labels.
data = load_breast_cancer()
X = data.data
y = data.target
n_features = X.shape[1]  # chromosome length: one gene per feature column

# ----- Utilities -----
def ensure_valid(mask):
    """Guarantee that a chromosome selects at least one feature.

    Args:
        mask: NumPy array of 0/1 genes. Modified in place when it is all zeros.

    Returns:
        The same ``mask`` object. If no gene was set, one randomly chosen
        position has been switched to 1; otherwise it is returned untouched.
    """
    # Guard clause: a mask that already selects something needs no repair
    # (and must not consume a random draw).
    if mask.sum() != 0:
        return mask

    # Avoid chromosomes with zero features by turning on one random gene.
    mask[np.random.randint(0, len(mask))] = 1
    return mask

def fitness(mask):
    """Score a feature mask by cross-validated classification accuracy.

    Args:
        mask: NumPy array of 0/1 genes, one per column of ``X``. It is copied
            before validation, so the caller's array is never modified.

    Returns:
        Mean accuracy over a 5-fold stratified, shuffled cross-validation
        (fixed ``random_state=42``) of a standardise-then-logistic-regression
        pipeline trained on the selected columns only.
    """
    # Work on a copy so that repairing an all-zero mask stays local.
    chosen = ensure_valid(mask.copy()).astype(bool)
    X_sel = X[:, chosen]

    scaler = StandardScaler()
    clf = LogisticRegression(solver="liblinear", max_iter=1000)
    pipe = Pipeline([("scaler", scaler), ("clf", clf)])

    # Same folds for every individual, so scores are directly comparable.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(pipe, X_sel, y, cv=cv, scoring="accuracy")
    return scores.mean()

def init_individual():
    """Create a random chromosome with at least one selected feature.

    Returns:
        An ``int8`` NumPy array of length ``n_features`` holding 0/1 genes.
    """
    genes = np.random.randint(0, 2, size=n_features)
    return ensure_valid(genes.astype(np.int8))

def tournament_select(pop, fits, k=TOURNAMENT_K):
    """Pick one individual by tournament selection and return a copy of it.

    Distinct population indices are drawn uniformly at random and the
    contestant with the highest fitness wins (the first one drawn on ties).

    Args:
        pop: Sequence of individuals; each must provide ``.copy()``.
        fits: Fitness values aligned index-for-index with ``pop``.
        k: Tournament size. Clamped to ``len(pop)`` because contestants are
            sampled without replacement.

    Returns:
        A copy of the winning individual, safe to mutate.

    Raises:
        ValueError: If ``pop`` is empty or ``k`` is less than 1.
    """
    n = len(pop)
    if n == 0:
        raise ValueError("cannot select from an empty population")
    if k < 1:
        raise ValueError("tournament size k must be at least 1")

    # np.random.choice(..., replace=False) raises when asked for more samples
    # than there are candidates, so never request more than the population.
    size = min(k, n)
    idxs = np.random.choice(n, size=size, replace=False)
    best = idxs[np.argmax([fits[i] for i in idxs])]
    return pop[best].copy()

def one_point_crossover(a, b):
    """Recombine two parents with single-point crossover.

    With probability ``CROSSOVER_RATE`` a cut point is chosen uniformly from
    ``1 .. len(a) - 1`` and the parents' tails are swapped; otherwise the
    children are plain copies of the parents.

    Args:
        a: First parent chromosome (NumPy array).
        b: Second parent chromosome (NumPy array).

    Returns:
        A tuple ``(child1, child2)`` of new arrays; the parents are not
        modified. Chromosomes with fewer than two genes have no valid cut
        point and are always returned as copies.
    """
    # Draw the crossover decision first so random-number consumption is the
    # same whether or not a cut is possible.
    do_cross = np.random.rand() < CROSSOVER_RATE

    # A cut needs at least one gene on each side; np.random.randint(1, len(a))
    # raises ValueError when len(a) < 2 because the range would be empty.
    if do_cross and len(a) >= 2:
        point = np.random.randint(1, len(a))
        child1 = np.concatenate([a[:point], b[point:]])
        child2 = np.concatenate([b[:point], a[point:]])
        return child1, child2
    return a.copy(), b.copy()

def bit_mutation(ind):
    """Flip each gene independently with probability ``MUTATION_RATE``.

    Args:
        ind: Chromosome as a NumPy array of 0/1 genes. Modified in place.

    Returns:
        The result of ``ensure_valid(ind)``, so a mutation can never leave
        the chromosome with zero selected features.
    """
    # One uniform draw per gene; genes whose draw falls below the rate flip.
    flip = np.random.rand(len(ind)) < MUTATION_RATE
    ind[flip] = 1 - ind[flip]
    return ensure_valid(ind)

# ----- GA loop -----
# Start from POP_SIZE random feature masks and score each one once.
population = [init_individual() for _ in range(POP_SIZE)]
fitnesses = [fitness(ind) for ind in population]

# Per-generation (best, average) fitness, appended in generation order.
history = []
for gen in range(GENERATIONS):
    new_pop = []

    # Elitism: carry a copy of the current best individual over unchanged,
    # so it is not lost to crossover or mutation.
    if ELITISM:
        best_idx = int(np.argmax(fitnesses))
        new_pop.append(population[best_idx].copy())

    # Breed two children per iteration: tournament-select two parents,
    # recombine them, then mutate each child independently.
    while len(new_pop) < POP_SIZE:
        p1 = tournament_select(population, fitnesses)
        p2 = tournament_select(population, fitnesses)
        c1, c2 = one_point_crossover(p1, p2)
        c1 = bit_mutation(c1)
        c2 = bit_mutation(c2)
        new_pop.extend([c1, c2])

    # Children arrive in pairs, so the list can overshoot by one; trim it.
    population = new_pop[:POP_SIZE]
    # Every individual is re-scored here, including the elite copy.
    fitnesses = [fitness(ind) for ind in population]
    best = float(np.max(fitnesses)); avg = float(np.mean(fitnesses))
    history.append((best, avg))
    print(f"Gen {gen+1:02d} | best={best:.4f} | avg={avg:.4f}")

# Report the fittest individual of the final generation.
best_idx = int(np.argmax(fitnesses))
best_mask = population[best_idx]
# Column indices of the features switched on in the winning mask.
selected = np.where(best_mask == 1)[0].tolist()
print("Selected features:", selected)
# NOTE(review): this is the same cross-validated score the search optimised,
# so it is presumably optimistic — confirm on data held out from the search.
print("Best CV accuracy:", fitnesses[best_idx])
