In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
"""
Evolutionary NAS with Preprocessing Search - Breast Cancer Dataset
"""

import copy
import random
from collections import namedtuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from ucimlrepo import fetch_ucirepo

# ----------------------------
# Config / Representation
# ----------------------------
# Width of the classifier's output layer; labels are binarised to 0/1 when the data is loaded.
NUM_CLASSES = 2
# Architecture genome type: an ordered list of per-layer dicts with
# 'units', 'activation' and 'dropout' keys.
Arch = list  # list of dicts for layers

def random_preprocessing():
    """Draw one preprocessing genome uniformly at random.

    Returns:
        dict with keys 'scaler', 'feature_selection', 'n_features'
        (target width for PCA / SelectKBest) and 'add_polynomial'.
    """
    # Draws happen in the same order as the keys appear in the result.
    scaler_name = random.choice(['standard', 'minmax', 'robust', 'none'])
    selector_name = random.choice(['none', 'pca', 'selectk'])
    target_width = random.choice([10, 15, 20, 25, 30])
    with_squares = random.choice([False, True])
    return dict(
        scaler=scaler_name,
        feature_selection=selector_name,
        n_features=target_width,
        add_polynomial=with_squares,
    )

def random_layer(input_dim, min_units=8, max_units=128):
    """Sample one hidden-layer gene at random.

    Args:
        input_dim: Accepted for call-site compatibility; not used when sampling.
        min_units: Smallest layer width that may be drawn (inclusive).
        max_units: Largest layer width that may be drawn (inclusive).

    Returns:
        dict with 'units', 'activation' and 'dropout' keys.

    Raises:
        ValueError: if no candidate width lies in [min_units, max_units].
    """
    # Honour the bounds instead of silently ignoring them. With the default
    # bounds the candidate list is the full set, so the draw is unchanged.
    unit_options = [u for u in (8, 16, 32, 64, 128) if min_units <= u <= max_units]
    if not unit_options:
        raise ValueError(
            f"no layer width available in [{min_units}, {max_units}]"
        )
    return {
        'units': random.choice(unit_options),
        'activation': random.choice(['relu', 'tanh', 'sigmoid']),
        'dropout': random.choice([0.0, 0.1, 0.2, 0.3, 0.5])
    }

def random_arch(input_dim, min_layers=1, max_layers=4):
    """Build a random architecture genome.

    The depth is drawn uniformly from [min_layers, max_layers] (inclusive),
    then that many layer genes are sampled with ``random_layer``.
    """
    depth = random.randint(min_layers, max_layers)
    genome = []
    for _ in range(depth):
        genome.append(random_layer(input_dim))
    return genome

def arch_to_str(arch):
    """Render an architecture genome as a compact one-line label.

    Each layer becomes ``U<units><ACTIVATION INITIAL>D<dropout>`` and layers
    are joined with ' | ' (e.g. ``U64TD0.5 | U16RD0.0``).
    """
    labels = []
    for layer in arch:
        labels.append(f"U{layer['units']}{layer['activation'][0].upper()}D{layer['dropout']}")
    return ' | '.join(labels)

def preprocess_to_str(prep):
    """Render a preprocessing genome as ``<scaler>|<features>[+poly]``.

    ``<features>`` is 'all' when no feature selection is configured, otherwise
    the selector name immediately followed by its target width.
    """
    if prep['feature_selection'] != 'none':
        feature_part = f"{prep['feature_selection']}{prep['n_features']}"
    else:
        feature_part = 'all'
    poly_part = ''
    if prep['add_polynomial']:
        poly_part = '+poly'
    return f"{prep['scaler']}|{feature_part}{poly_part}"

# ----------------------------
# Preprocessing Pipeline
# ----------------------------
class PreprocessingPipeline:
    """Config-driven preprocessing chain that is fitted on training data only.

    Stages, applied in this order by both ``fit`` and ``transform``:
      1. optional scaling ('standard' / 'minmax' / 'robust'; anything else = none),
      2. optional feature expansion that appends the element-wise squares,
      3. optional reduction with PCA or SelectKBest(f_classif).
    """

    def __init__(self, config):
        self.config = config
        self.scaler = None               # fitted scaler, or None when scaling is off
        self.feature_transformer = None  # fitted PCA / SelectKBest, or None
        self.n_output_features = None    # column count produced by the last fit()

    def _build_scaler(self):
        """Return an unfitted scaler for config['scaler'], or None for no scaling."""
        kind = self.config['scaler']
        if kind == 'standard':
            return StandardScaler()
        if kind == 'minmax':
            return MinMaxScaler()
        if kind == 'robust':
            return RobustScaler()
        return None

    def _expand(self, X):
        """Append squared copies of every column when 'add_polynomial' is set."""
        if self.config['add_polynomial']:
            return np.hstack([X, X ** 2])
        return X

    def fit(self, X_train, y_train):
        """Fit every configured stage on the training data.

        Args:
            X_train: 2-D array of training features (left unmodified).
            y_train: Training labels; only consulted by SelectKBest.

        Returns:
            The transformed training matrix.
        """
        X = X_train.copy()

        # Start from a clean slate so a refit never keeps a transformer that
        # the current config no longer asks for.
        self.scaler = self._build_scaler()
        self.feature_transformer = None

        # Compare against None explicitly rather than relying on the
        # estimator object's truthiness.
        if self.scaler is not None:
            X = self.scaler.fit_transform(X)

        X = self._expand(X)

        selection = self.config['feature_selection']
        if selection == 'pca':
            # PCA cannot extract more components than min(n_samples, n_features).
            n_comp = min(self.config['n_features'], X.shape[1], X.shape[0])
            self.feature_transformer = PCA(n_components=n_comp)
            X = self.feature_transformer.fit_transform(X)
        elif selection == 'selectk':
            k = min(self.config['n_features'], X.shape[1])
            self.feature_transformer = SelectKBest(f_classif, k=k)
            X = self.feature_transformer.fit_transform(X, np.asarray(y_train).ravel())

        self.n_output_features = X.shape[1]
        return X

    def transform(self, X):
        """Apply the stages fitted by ``fit`` to new data and return the result.

        Stages that were never fitted (attribute still None) are skipped.
        """
        X_trans = X.copy()

        if self.scaler is not None:
            X_trans = self.scaler.transform(X_trans)

        X_trans = self._expand(X_trans)

        if self.feature_transformer is not None:
            X_trans = self.feature_transformer.transform(X_trans)

        return X_trans

# ----------------------------
# Model builder
# ----------------------------
class SimpleMLP(nn.Module):
    """Feed-forward classifier assembled from an architecture genome.

    Every genome entry contributes Linear -> activation -> Dropout, where the
    activation is omitted for unrecognised names and Dropout is omitted when
    its rate is not positive. A final Linear layer maps to ``num_classes``.
    """

    def __init__(self, arch: Arch, input_dim, num_classes=NUM_CLASSES):
        super().__init__()
        activation_factories = {
            'relu': nn.ReLU,
            'tanh': nn.Tanh,
            'sigmoid': nn.Sigmoid,
        }

        modules = []
        in_features = input_dim
        for spec in arch:
            width = spec['units']
            modules.append(nn.Linear(in_features, width))

            make_activation = activation_factories.get(spec['activation'])
            if make_activation is not None:
                modules.append(make_activation())

            drop_rate = spec['dropout']
            if drop_rate > 0:
                modules.append(nn.Dropout(drop_rate))

            # The next layer consumes this layer's output width.
            in_features = width

        modules.append(nn.Linear(in_features, num_classes))
        self.network = nn.Sequential(*modules)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the final layer's output."""
        return self.network(x)

# ----------------------------
# Weight inheritance helper
# ----------------------------
def try_inherit_weights(child: nn.Module, parent: nn.Module):
    """Copy every parent tensor whose key and shape both match into the child.

    Entries that exist only in one model, or whose shapes differ, are left as
    the child initialised them.

    Returns:
        Number of state-dict entries that were copied.
    """
    merged = child.state_dict()
    inherited = {
        key: tensor.clone()
        for key, tensor in parent.state_dict().items()
        if key in merged and merged[key].shape == tensor.shape
    }
    merged.update(inherited)
    child.load_state_dict(merged)
    return len(inherited)

# ----------------------------
# Data loading
# ----------------------------
def load_breast_cancer_data():
    """Fetch UCI repository dataset id 17 and split it into train/val/test.

    Labels equal to 'M' become 1 and everything else becomes 0. The data is
    split 70% / 15% / 15% with stratification and a fixed ``random_state`` of 42.

    Returns:
        (X_train, X_val, X_test, y_train, y_val, y_test) as numpy arrays.
    """
    dataset = fetch_ucirepo(id=17)

    # DataFrames -> numpy; the target frame is flattened to 1-D.
    features = dataset.data.features.values
    raw_labels = dataset.data.targets.values.ravel()

    # Binary encoding: M=1, B=0.
    labels = (raw_labels == 'M').astype(int)

    # First carve off 30%, then halve that hold-out into validation and test.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, labels, test_size=0.3, random_state=42, stratify=labels
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42, stratify=y_holdout
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

# ----------------------------
# Training utils
# ----------------------------
def train_one_epoch(model, device, X_train, y_train, optimizer, criterion, batch_size=32):
    """Run one shuffled pass over the training arrays, updating ``model`` in place.

    Args:
        model: Network to train (switched to train mode).
        device: Device the mini-batches are moved to.
        X_train, y_train: Feature matrix and integer labels, indexable by an
            array of row indices.
        optimizer, criterion: Optimiser and loss used for each step.
        batch_size: Rows per mini-batch; the last batch may be smaller.

    Returns:
        (avg_loss, acc): sample-weighted mean loss and accuracy over the epoch.
    """
    model.train()
    n_samples = len(X_train)
    order = np.random.permutation(n_samples)

    loss_total, hits, seen = 0.0, 0, 0
    for start in range(0, n_samples, batch_size):
        rows = order[start:start + batch_size]
        n_rows = len(rows)
        inputs = torch.FloatTensor(X_train[rows]).to(device)
        targets = torch.LongTensor(y_train[rows]).to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a short final batch does not skew the mean.
        loss_total += batch_loss.item() * n_rows
        predicted = logits.max(1)[1]
        seen += n_rows
        hits += predicted.eq(targets).sum().item()

    return loss_total / seen, hits / seen

def evaluate(model, device, X, y, criterion=None, batch_size=32):
    """Score ``model`` on (X, y) in eval mode without tracking gradients.

    Args:
        model: Network to evaluate (switched to eval mode).
        device: Device the mini-batches are moved to.
        X, y: Feature matrix and integer labels, sliced in order.
        criterion: Optional loss; when omitted no loss is computed.
        batch_size: Rows per mini-batch.

    Returns:
        (avg_loss, acc): ``avg_loss`` is the sample-weighted mean loss, or
        None when no criterion was given.
    """
    model.eval()
    n_samples = len(X)

    hits, seen, loss_total = 0, 0, 0.0
    with torch.no_grad():
        for start in range(0, n_samples, batch_size):
            stop = start + batch_size
            inputs = torch.FloatTensor(X[start:stop]).to(device)
            targets = torch.LongTensor(y[start:stop]).to(device)
            n_rows = len(targets)

            logits = model(inputs)
            if criterion:
                loss_total += criterion(logits, targets).item() * n_rows

            predicted = logits.max(1)[1]
            seen += n_rows
            hits += predicted.eq(targets).sum().item()

    acc = hits / seen
    if criterion:
        return loss_total / seen, acc
    return None, acc

# ----------------------------
# Evolutionary algorithm
# ----------------------------
# One member of the population:
#   preprocessing - preprocessing config dict
#   arch          - list of layer dicts
#   fitness       - validation accuracy (the sort keys treat None as 0.0)
#   model_state   - state_dict of the trained network
#   pipeline      - the PreprocessingPipeline fitted for this individual
Individual = namedtuple('Individual', ['preprocessing', 'arch', 'fitness', 'model_state', 'pipeline'])

def mutate_arch(arch: Arch, max_layers=6):
    """Return a mutated deep copy of ``arch``; the input is never modified.

    One operation is drawn at random:
      * 'add'    - insert a random layer at a random position, only while the
                   genome has fewer than ``max_layers`` layers;
      * 'remove' - drop a random layer, only while more than one remains;
      * 'modify' - re-draw one field of a random layer.
    An 'add' or 'remove' that is not allowed falls back to 'modify'.
    """
    mutated = copy.deepcopy(arch)
    op = random.choice(['add', 'remove', 'modify'])

    if op == 'add' and len(mutated) < max_layers:
        mutated.insert(random.randint(0, len(mutated)), random_layer(0))
        return mutated

    if op == 'remove' and len(mutated) > 1:
        del mutated[random.randrange(len(mutated))]
        return mutated

    # 'modify', or the fallback for a disallowed add/remove.
    target = mutated[random.randrange(len(mutated))]
    gene = random.choice(['units', 'activation', 'dropout'])
    if gene == 'units':
        target['units'] = random.choice([8, 16, 32, 64, 128])
    elif gene == 'activation':
        target['activation'] = random.choice(['relu', 'tanh', 'sigmoid'])
    else:
        target['dropout'] = random.choice([0.0, 0.1, 0.2, 0.3, 0.5])
    return mutated

def mutate_preprocessing(prep):
    """Return a deep copy of ``prep`` with exactly one field re-drawn.

    'scaler', 'feature_selection' and 'n_features' are re-sampled from their
    option lists (and may land on the current value); 'add_polynomial' is flipped.
    """
    resample_options = {
        'scaler': ['standard', 'minmax', 'robust', 'none'],
        'feature_selection': ['none', 'pca', 'selectk'],
        'n_features': [10, 15, 20, 25, 30],
    }

    mutated = copy.deepcopy(prep)
    gene = random.choice(['scaler', 'feature_selection', 'n_features', 'add_polynomial'])

    if gene in resample_options:
        mutated[gene] = random.choice(resample_options[gene])
    else:
        mutated['add_polynomial'] = not mutated['add_polynomial']

    return mutated

def evolve(population, X_train, y_train, X_val, y_val, device, args):
    """Produce the next generation from ``population``.

    The fittest ``max(1, int(args.elitism * len(population)))`` individuals are
    carried over unchanged. The remaining slots, up to ``args.pop_size``, are
    filled with children: a parent wins a tournament of ``args.tournament_k``
    random individuals, then EITHER its preprocessing OR its architecture is
    mutated (50/50). Each child gets a newly constructed network, trained for
    ``args.train_epochs`` epochs and scored by validation accuracy.

    Returns:
        list of Individual: elites first, followed by the new children.
    """
    def score(individual):
        # Individuals without a fitness rank as 0.0.
        return individual.fitness if individual.fitness is not None else 0.0

    ranked = sorted(population, key=score, reverse=True)
    elite_count = max(1, int(args.elitism * len(ranked)))
    next_pop = list(ranked[:elite_count])

    while len(next_pop) < args.pop_size:
        contenders = random.sample(ranked, k=min(args.tournament_k, len(ranked)))
        parent = max(contenders, key=score)

        # Mutate one half of the genome, inherit the other half as-is.
        if random.random() < 0.5:
            child_prep, child_arch = mutate_preprocessing(parent.preprocessing), parent.arch
        else:
            child_prep, child_arch = parent.preprocessing, mutate_arch(parent.arch, max_layers=args.max_layers)

        # Fit the child's own pipeline on the training split only.
        pipeline = PreprocessingPipeline(child_prep)
        train_features = pipeline.fit(X_train, y_train)
        val_features = pipeline.transform(X_val)

        child_model = SimpleMLP(child_arch, train_features.shape[1]).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(child_model.parameters(), lr=args.lr, weight_decay=1e-4)

        for _ in range(args.train_epochs):
            train_one_epoch(child_model, device, train_features, y_train, optimizer, criterion, args.batch_size)

        val_acc = evaluate(child_model, device, val_features, y_val, None, args.batch_size)[1]
        next_pop.append(Individual(
            preprocessing=child_prep,
            arch=child_arch,
            fitness=val_acc,
            model_state=child_model.state_dict(),
            pipeline=pipeline,
        ))

    return next_pop

# ----------------------------
# Main Evolution Run
# ----------------------------
def run_evolution(args):
    """Run the full evolutionary search and report the winner's test accuracy.

    Steps: load and split the data, train a random initial population, evolve
    it for ``args.generations`` generations, then evaluate the fittest
    individual on the held-out test split. Progress is printed along the way.

    Args:
        args: Object exposing pop_size, generations, train_epochs,
            init_train_epochs, batch_size, lr, elitism, tournament_k,
            min_layers, init_max_layers and max_layers.

    Returns:
        The best Individual of the final population.

    Raises:
        ValueError: if the initial population is empty (``args.pop_size`` < 1).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Device:", device)

    # Load data
    X_train, X_val, X_test, y_train, y_val, y_test = load_breast_cancer_data()
    print(f"Data loaded: Train={len(X_train)}, Val={len(X_val)}, Test={len(X_test)}")

    def fitness_key(individual):
        # Individuals without a fitness rank as 0.0.
        return individual.fitness if individual.fitness is not None else 0.0

    # Initialize population
    population = []
    print("\nInitializing population...")
    for i in range(args.pop_size):
        prep = random_preprocessing()
        arch = random_arch(30, min_layers=args.min_layers, max_layers=args.init_max_layers)

        # Apply preprocessing
        pipeline = PreprocessingPipeline(prep)
        X_train_proc = pipeline.fit(X_train, y_train)
        X_val_proc = pipeline.transform(X_val)

        # Build and train model
        input_dim = X_train_proc.shape[1]
        model = SimpleMLP(arch, input_dim).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)

        for ep in range(args.init_train_epochs):
            train_one_epoch(model, device, X_train_proc, y_train, optimizer, criterion, args.batch_size)

        _, val_acc = evaluate(model, device, X_val_proc, y_val, None, args.batch_size)
        state = model.state_dict()
        population.append(Individual(preprocessing=prep, arch=arch, fitness=val_acc,
                                    model_state=state, pipeline=pipeline))
        print(f" Init {i+1}/{args.pop_size}: Prep={preprocess_to_str(prep)} | Arch={arch_to_str(arch)} | val_acc={val_acc:.4f}")

    if not population:
        raise ValueError("pop_size must be at least 1")

    # Seed `best` from the initial population so that generations == 0 still
    # yields a result instead of dereferencing None below.
    best = max(population, key=fitness_key)
    for gen in range(1, args.generations + 1):
        print(f"\n=== Generation {gen} ===")
        population = evolve(population, X_train, y_train, X_val, y_val, device, args)
        population = sorted(population, key=fitness_key, reverse=True)
        best = population[0]
        print(f" Best gen {gen}: Prep={preprocess_to_str(best.preprocessing)} | Arch={arch_to_str(best.arch)} | val_acc={best.fitness:.4f}")

    print("\n=== Final Best ===")
    # Reuse the pipeline exactly as it was fitted when the winner was trained.
    # Refitting here would replace the transformers the stored weights were
    # trained against.
    X_test_proc = best.pipeline.transform(X_test)

    input_dim = X_test_proc.shape[1]
    best_model = SimpleMLP(best.arch, input_dim).to(device)
    best_model.load_state_dict(best.model_state)

    _, test_acc = evaluate(best_model, device, X_test_proc, y_test, None, args.batch_size)
    print(f" Best Preprocessing: {preprocess_to_str(best.preprocessing)}")
    print(f" Best Architecture: {arch_to_str(best.arch)}")
    print(f" Val Acc: {best.fitness:.4f} | Test Acc: {test_acc:.4f}")
    return best

# ----------------------------
# Fixed Arguments
# ----------------------------
class Args:
    """Fixed hyper-parameters for the evolutionary search."""
    seed = 42              # seed applied to the random / numpy / torch RNGs before the run
    pop_size = 6           # individuals per generation
    generations = 4        # number of evolve() rounds
    train_epochs = 3       # epochs used to train each child
    init_train_epochs = 3  # epochs used to train each initial individual
    batch_size = 32
    lr = 0.001             # Adam learning rate
    elitism = 0.3          # fraction of the population carried over unchanged
    tournament_k = 3       # contenders per tournament selection
    min_layers = 1         # minimum depth of an initial architecture
    init_max_layers = 3    # maximum depth of an initial architecture
    max_layers = 5         # depth cap enforced when mutating

args = Args()

# The search draws from Python's `random`, numpy and torch; seed all three so
# a Restart & Run All repeats the same search.
# NOTE(review): some GPU kernels may still be non-deterministic - confirm if
# bit-exact reruns on CUDA are required.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

best_individual = run_evolution(args)

Device: cuda
Data loaded: Train=398, Val=85, Test=86

Initializing population...
 Init 1/6: Prep=standard|all | Arch=U64TD0.5 | U16TD0.2 | U8TD0.2 | val_acc=0.9529
 Init 2/6: Prep=none|all+poly | Arch=U64SD0.3 | U32RD0.5 | U128RD0.3 | val_acc=0.6235
 Init 3/6: Prep=minmax|pca30 | Arch=U16RD0.1 | U64SD0.3 | val_acc=0.6235
 Init 4/6: Prep=standard|pca25 | Arch=U16SD0.3 | val_acc=0.6235
 Init 5/6: Prep=robust|pca10+poly | Arch=U8RD0.5 | val_acc=0.6941
 Init 6/6: Prep=minmax|all+poly | Arch=U16SD0.3 | U16SD0.0 | U32RD0.3 | val_acc=0.6235

=== Generation 1 ===
 Best gen 1: Prep=standard|all | Arch=U64TD0.5 | U16TD0.2 | U8TD0.2 | val_acc=0.9529

=== Generation 2 ===
 Best gen 2: Prep=standard|all | Arch=U64TD0.5 | U16TD0.2 | U8TD0.2 | val_acc=0.9529

=== Generation 3 ===
 Best gen 3: Prep=standard|all | Arch=U64TD0.5 | U64TD0.5 | val_acc=0.9647

=== Generation 4 ===
 Best gen 4: Prep=standard|all | Arch=U64TD0.5 | U64TD0.5 | val_acc=0.9647

=== Final Best ===
 Best Preprocessing: standard|al

## EvoNAS (MLP): joint search over preprocessing and architecture

In [None]:
"""
Evolutionary NAS with Preprocessing Search - Breast Cancer Dataset (MLP)
"""

import copy
import random
from collections import namedtuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from ucimlrepo import fetch_ucirepo

# ----------------------------
# Config / Representation
# ----------------------------
# Width of the classifier's output layer; labels are binarised to 0/1 when the data is loaded.
NUM_CLASSES = 2
# Architecture genome type: an ordered list of per-layer dicts with
# 'units', 'activation' and 'dropout' keys.
Arch = list  # list of dicts for layers

def random_preprocessing():
    """Draw one preprocessing genome uniformly at random.

    Returns:
        dict with keys 'scaler', 'feature_selection', 'n_features'
        (target width for PCA / SelectKBest) and 'add_polynomial'.
    """
    # Draws happen in the same order as the keys appear in the result.
    scaler_name = random.choice(['standard', 'minmax', 'robust', 'none'])
    selector_name = random.choice(['none', 'pca', 'selectk'])
    target_width = random.choice([10, 15, 20, 25, 30])
    with_squares = random.choice([False, True])
    return dict(
        scaler=scaler_name,
        feature_selection=selector_name,
        n_features=target_width,
        add_polynomial=with_squares,
    )

def random_layer(input_dim, min_units=8, max_units=128):
    """Sample one hidden-layer gene at random.

    Args:
        input_dim: Accepted for call-site compatibility; not used when sampling.
        min_units: Smallest layer width that may be drawn (inclusive).
        max_units: Largest layer width that may be drawn (inclusive).

    Returns:
        dict with 'units', 'activation' and 'dropout' keys.

    Raises:
        ValueError: if no candidate width lies in [min_units, max_units].
    """
    # Honour the bounds instead of silently ignoring them. With the default
    # bounds the candidate list is the full set, so the draw is unchanged.
    unit_options = [u for u in (8, 16, 32, 64, 128) if min_units <= u <= max_units]
    if not unit_options:
        raise ValueError(
            f"no layer width available in [{min_units}, {max_units}]"
        )
    return {
        'units': random.choice(unit_options),
        'activation': random.choice(['relu', 'tanh', 'sigmoid']),
        'dropout': random.choice([0.0, 0.1, 0.2, 0.3, 0.5])
    }

def random_arch(input_dim, min_layers=1, max_layers=4):
    """Build a random architecture genome.

    The depth is drawn uniformly from [min_layers, max_layers] (inclusive),
    then that many layer genes are sampled with ``random_layer``.
    """
    depth = random.randint(min_layers, max_layers)
    genome = []
    for _ in range(depth):
        genome.append(random_layer(input_dim))
    return genome

def arch_to_str(arch):
    """Render an architecture genome as a compact one-line label.

    Each layer becomes ``U<units><ACTIVATION INITIAL>D<dropout>`` and layers
    are joined with ' | ' (e.g. ``U64TD0.5 | U16RD0.0``).
    """
    labels = []
    for layer in arch:
        labels.append(f"U{layer['units']}{layer['activation'][0].upper()}D{layer['dropout']}")
    return ' | '.join(labels)

def preprocess_to_str(prep):
    """Render a preprocessing genome as ``<scaler>|<features>[+poly]``.

    ``<features>`` is 'all' when no feature selection is configured, otherwise
    the selector name immediately followed by its target width.
    """
    if prep['feature_selection'] != 'none':
        feature_part = f"{prep['feature_selection']}{prep['n_features']}"
    else:
        feature_part = 'all'
    poly_part = ''
    if prep['add_polynomial']:
        poly_part = '+poly'
    return f"{prep['scaler']}|{feature_part}{poly_part}"

# ----------------------------
# Preprocessing Pipeline
# ----------------------------
class PreprocessingPipeline:
    """Config-driven preprocessing chain that is fitted on training data only.

    Stages, applied in this order by both ``fit`` and ``transform``:
      1. optional scaling ('standard' / 'minmax' / 'robust'; anything else = none),
      2. optional feature expansion that appends the element-wise squares,
      3. optional reduction with PCA or SelectKBest(f_classif).
    """

    def __init__(self, config):
        self.config = config
        self.scaler = None               # fitted scaler, or None when scaling is off
        self.feature_transformer = None  # fitted PCA / SelectKBest, or None
        self.n_output_features = None    # column count produced by the last fit()

    def _build_scaler(self):
        """Return an unfitted scaler for config['scaler'], or None for no scaling."""
        kind = self.config['scaler']
        if kind == 'standard':
            return StandardScaler()
        if kind == 'minmax':
            return MinMaxScaler()
        if kind == 'robust':
            return RobustScaler()
        return None

    def _expand(self, X):
        """Append squared copies of every column when 'add_polynomial' is set."""
        if self.config['add_polynomial']:
            return np.hstack([X, X ** 2])
        return X

    def fit(self, X_train, y_train):
        """Fit every configured stage on the training data.

        Args:
            X_train: 2-D array of training features (left unmodified).
            y_train: Training labels; only consulted by SelectKBest.

        Returns:
            The transformed training matrix.
        """
        X = X_train.copy()

        # Start from a clean slate so a refit never keeps a transformer that
        # the current config no longer asks for.
        self.scaler = self._build_scaler()
        self.feature_transformer = None

        # Compare against None explicitly rather than relying on the
        # estimator object's truthiness.
        if self.scaler is not None:
            X = self.scaler.fit_transform(X)

        X = self._expand(X)

        selection = self.config['feature_selection']
        if selection == 'pca':
            # PCA cannot extract more components than min(n_samples, n_features).
            n_comp = min(self.config['n_features'], X.shape[1], X.shape[0])
            self.feature_transformer = PCA(n_components=n_comp)
            X = self.feature_transformer.fit_transform(X)
        elif selection == 'selectk':
            k = min(self.config['n_features'], X.shape[1])
            self.feature_transformer = SelectKBest(f_classif, k=k)
            X = self.feature_transformer.fit_transform(X, np.asarray(y_train).ravel())

        self.n_output_features = X.shape[1]
        return X

    def transform(self, X):
        """Apply the stages fitted by ``fit`` to new data and return the result.

        Stages that were never fitted (attribute still None) are skipped.
        """
        X_trans = X.copy()

        if self.scaler is not None:
            X_trans = self.scaler.transform(X_trans)

        X_trans = self._expand(X_trans)

        if self.feature_transformer is not None:
            X_trans = self.feature_transformer.transform(X_trans)

        return X_trans

# ----------------------------
# Multi-Layer Perceptron (MLP) Model
# ----------------------------
class MLP(nn.Module):
    """
    Multi-Layer Perceptron assembled from an architecture genome.

    Every genome entry contributes Linear → Activation → Dropout, where the
    activation is omitted for unrecognised names and Dropout is omitted when
    its rate is not positive. A final Linear layer maps to ``num_classes``.
    """

    def __init__(self, arch: Arch, input_dim, num_classes=NUM_CLASSES):
        super().__init__()
        activation_factories = {
            'relu': nn.ReLU,
            'tanh': nn.Tanh,
            'sigmoid': nn.Sigmoid,
        }

        modules = []
        in_features = input_dim
        for spec in arch:
            width = spec['units']
            modules.append(nn.Linear(in_features, width))

            make_activation = activation_factories.get(spec['activation'])
            if make_activation is not None:
                modules.append(make_activation())

            drop_rate = spec['dropout']
            if drop_rate > 0:
                modules.append(nn.Dropout(drop_rate))

            # The next layer consumes this layer's output width.
            in_features = width

        # Output layer
        modules.append(nn.Linear(in_features, num_classes))
        self.network = nn.Sequential(*modules)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the final layer's output."""
        return self.network(x)

# ----------------------------
# Weight inheritance helper
# ----------------------------
def try_inherit_weights(child: nn.Module, parent: nn.Module):
    """Copy every parent tensor whose key and shape both match into the child.

    Entries that exist only in one model, or whose shapes differ, are left as
    the child initialised them.

    Returns:
        Number of state-dict entries that were copied.
    """
    merged = child.state_dict()
    inherited = {
        key: tensor.clone()
        for key, tensor in parent.state_dict().items()
        if key in merged and merged[key].shape == tensor.shape
    }
    merged.update(inherited)
    child.load_state_dict(merged)
    return len(inherited)

# ----------------------------
# Data loading
# ----------------------------
def load_breast_cancer_data():
    """Fetch UCI repository dataset id 17 and split it into train/val/test.

    Labels equal to 'M' become 1 and everything else becomes 0. The data is
    split 70% / 15% / 15% with stratification and a fixed ``random_state`` of 42.

    Returns:
        (X_train, X_val, X_test, y_train, y_val, y_test) as numpy arrays.
    """
    dataset = fetch_ucirepo(id=17)

    # DataFrames -> numpy; the target frame is flattened to 1-D.
    features = dataset.data.features.values
    raw_labels = dataset.data.targets.values.ravel()

    # Binary encoding: M=1, B=0.
    labels = (raw_labels == 'M').astype(int)

    # First carve off 30%, then halve that hold-out into validation and test.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, labels, test_size=0.3, random_state=42, stratify=labels
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42, stratify=y_holdout
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

# ----------------------------
# Training utils
# ----------------------------
def train_one_epoch(model, device, X_train, y_train, optimizer, criterion, batch_size=32):
    """Run one shuffled pass over the training arrays, updating ``model`` in place.

    Args:
        model: Network to train (switched to train mode).
        device: Device the mini-batches are moved to.
        X_train, y_train: Feature matrix and integer labels, indexable by an
            array of row indices.
        optimizer, criterion: Optimiser and loss used for each step.
        batch_size: Rows per mini-batch; the last batch may be smaller.

    Returns:
        (avg_loss, acc): sample-weighted mean loss and accuracy over the epoch.
    """
    model.train()
    n_samples = len(X_train)
    order = np.random.permutation(n_samples)

    loss_total, hits, seen = 0.0, 0, 0
    for start in range(0, n_samples, batch_size):
        rows = order[start:start + batch_size]
        n_rows = len(rows)
        inputs = torch.FloatTensor(X_train[rows]).to(device)
        targets = torch.LongTensor(y_train[rows]).to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a short final batch does not skew the mean.
        loss_total += batch_loss.item() * n_rows
        predicted = logits.max(1)[1]
        seen += n_rows
        hits += predicted.eq(targets).sum().item()

    return loss_total / seen, hits / seen

def evaluate(model, device, X, y, criterion=None, batch_size=32):
    """Score ``model`` on (X, y) in eval mode without tracking gradients.

    Args:
        model: Network to evaluate (switched to eval mode).
        device: Device the mini-batches are moved to.
        X, y: Feature matrix and integer labels, sliced in order.
        criterion: Optional loss; when omitted no loss is computed.
        batch_size: Rows per mini-batch.

    Returns:
        (avg_loss, acc): ``avg_loss`` is the sample-weighted mean loss, or
        None when no criterion was given.
    """
    model.eval()
    n_samples = len(X)

    hits, seen, loss_total = 0, 0, 0.0
    with torch.no_grad():
        for start in range(0, n_samples, batch_size):
            stop = start + batch_size
            inputs = torch.FloatTensor(X[start:stop]).to(device)
            targets = torch.LongTensor(y[start:stop]).to(device)
            n_rows = len(targets)

            logits = model(inputs)
            if criterion:
                loss_total += criterion(logits, targets).item() * n_rows

            predicted = logits.max(1)[1]
            seen += n_rows
            hits += predicted.eq(targets).sum().item()

    acc = hits / seen
    if criterion:
        return loss_total / seen, acc
    return None, acc

# ----------------------------
# Evolutionary algorithm
# ----------------------------
# One member of the population:
#   preprocessing - preprocessing config dict
#   arch          - list of layer dicts
#   fitness       - validation accuracy (the sort keys treat None as 0.0)
#   model_state   - state_dict of the trained network
#   pipeline      - the PreprocessingPipeline fitted for this individual
Individual = namedtuple('Individual', ['preprocessing', 'arch', 'fitness', 'model_state', 'pipeline'])

def mutate_arch(arch: Arch, max_layers=6):
    """Return a mutated deep copy of ``arch``; the input is never modified.

    One operation is drawn at random:
      * 'add'    - insert a random layer at a random position, only while the
                   genome has fewer than ``max_layers`` layers;
      * 'remove' - drop a random layer, only while more than one remains;
      * 'modify' - re-draw one field of a random layer.
    An 'add' or 'remove' that is not allowed falls back to 'modify'.
    """
    mutated = copy.deepcopy(arch)
    op = random.choice(['add', 'remove', 'modify'])

    if op == 'add' and len(mutated) < max_layers:
        mutated.insert(random.randint(0, len(mutated)), random_layer(0))
        return mutated

    if op == 'remove' and len(mutated) > 1:
        del mutated[random.randrange(len(mutated))]
        return mutated

    # 'modify', or the fallback for a disallowed add/remove.
    target = mutated[random.randrange(len(mutated))]
    gene = random.choice(['units', 'activation', 'dropout'])
    if gene == 'units':
        target['units'] = random.choice([8, 16, 32, 64, 128])
    elif gene == 'activation':
        target['activation'] = random.choice(['relu', 'tanh', 'sigmoid'])
    else:
        target['dropout'] = random.choice([0.0, 0.1, 0.2, 0.3, 0.5])
    return mutated

def mutate_preprocessing(prep):
    """Return a deep copy of ``prep`` with exactly one field re-drawn.

    'scaler', 'feature_selection' and 'n_features' are re-sampled from their
    option lists (and may land on the current value); 'add_polynomial' is flipped.
    """
    resample_options = {
        'scaler': ['standard', 'minmax', 'robust', 'none'],
        'feature_selection': ['none', 'pca', 'selectk'],
        'n_features': [10, 15, 20, 25, 30],
    }

    mutated = copy.deepcopy(prep)
    gene = random.choice(['scaler', 'feature_selection', 'n_features', 'add_polynomial'])

    if gene in resample_options:
        mutated[gene] = random.choice(resample_options[gene])
    else:
        mutated['add_polynomial'] = not mutated['add_polynomial']

    return mutated

def evolve(population, X_train, y_train, X_val, y_val, device, args):
    """Produce the next generation from ``population``.

    The fittest ``max(1, int(args.elitism * len(population)))`` individuals are
    carried over unchanged. The remaining slots, up to ``args.pop_size``, are
    filled with children: a parent wins a tournament of ``args.tournament_k``
    random individuals, then EITHER its preprocessing OR its architecture is
    mutated (50/50). Each child gets a newly constructed MLP, trained for
    ``args.train_epochs`` epochs and scored by validation accuracy.

    Returns:
        list of Individual: elites first, followed by the new children.
    """
    def score(individual):
        # Individuals without a fitness rank as 0.0.
        return individual.fitness if individual.fitness is not None else 0.0

    ranked = sorted(population, key=score, reverse=True)
    elite_count = max(1, int(args.elitism * len(ranked)))
    next_pop = list(ranked[:elite_count])

    while len(next_pop) < args.pop_size:
        contenders = random.sample(ranked, k=min(args.tournament_k, len(ranked)))
        parent = max(contenders, key=score)

        # Mutate one half of the genome, inherit the other half as-is.
        if random.random() < 0.5:
            child_prep, child_arch = mutate_preprocessing(parent.preprocessing), parent.arch
        else:
            child_prep, child_arch = parent.preprocessing, mutate_arch(parent.arch, max_layers=args.max_layers)

        # Fit the child's own pipeline on the training split only.
        pipeline = PreprocessingPipeline(child_prep)
        train_features = pipeline.fit(X_train, y_train)
        val_features = pipeline.transform(X_val)

        child_model = MLP(child_arch, train_features.shape[1]).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(child_model.parameters(), lr=args.lr, weight_decay=1e-4)

        for _ in range(args.train_epochs):
            train_one_epoch(child_model, device, train_features, y_train, optimizer, criterion, args.batch_size)

        val_acc = evaluate(child_model, device, val_features, y_val, None, args.batch_size)[1]
        next_pop.append(Individual(
            preprocessing=child_prep,
            arch=child_arch,
            fitness=val_acc,
            model_state=child_model.state_dict(),
            pipeline=pipeline,
        ))

    return next_pop

# ----------------------------
# Main Evolution Run
# ----------------------------
def run_evolution(args):
    """Run the full evolutionary search and report the winner's test accuracy.

    Steps: load and split the data, train a random initial population, evolve
    it for ``args.generations`` generations, then evaluate the fittest
    individual on the held-out test split. Progress is printed along the way.

    Args:
        args: Object exposing pop_size, generations, train_epochs,
            init_train_epochs, batch_size, lr, elitism, tournament_k,
            min_layers, init_max_layers and max_layers.

    Returns:
        The best Individual of the final population.

    Raises:
        ValueError: if the initial population is empty (``args.pop_size`` < 1).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Device:", device)
    print("Using Multi-Layer Perceptron (MLP) architecture")

    # Load data
    X_train, X_val, X_test, y_train, y_val, y_test = load_breast_cancer_data()
    print(f"Data loaded: Train={len(X_train)}, Val={len(X_val)}, Test={len(X_test)}")

    def fitness_key(individual):
        # Individuals without a fitness rank as 0.0.
        return individual.fitness if individual.fitness is not None else 0.0

    # Initialize population
    population = []
    print("\nInitializing population...")
    for i in range(args.pop_size):
        prep = random_preprocessing()
        arch = random_arch(30, min_layers=args.min_layers, max_layers=args.init_max_layers)

        # Apply preprocessing
        pipeline = PreprocessingPipeline(prep)
        X_train_proc = pipeline.fit(X_train, y_train)
        X_val_proc = pipeline.transform(X_val)

        # Build and train MLP model
        input_dim = X_train_proc.shape[1]
        model = MLP(arch, input_dim).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)

        for ep in range(args.init_train_epochs):
            train_one_epoch(model, device, X_train_proc, y_train, optimizer, criterion, args.batch_size)

        _, val_acc = evaluate(model, device, X_val_proc, y_val, None, args.batch_size)
        state = model.state_dict()
        population.append(Individual(preprocessing=prep, arch=arch, fitness=val_acc,
                                    model_state=state, pipeline=pipeline))
        print(f" Init {i+1}/{args.pop_size}: Prep={preprocess_to_str(prep)} | Arch={arch_to_str(arch)} | val_acc={val_acc:.4f}")

    if not population:
        raise ValueError("pop_size must be at least 1")

    # Seed `best` from the initial population so that generations == 0 still
    # yields a result instead of dereferencing None below.
    best = max(population, key=fitness_key)
    for gen in range(1, args.generations + 1):
        print(f"\n=== Generation {gen} ===")
        population = evolve(population, X_train, y_train, X_val, y_val, device, args)
        population = sorted(population, key=fitness_key, reverse=True)
        best = population[0]
        print(f" Best gen {gen}: Prep={preprocess_to_str(best.preprocessing)} | Arch={arch_to_str(best.arch)} | val_acc={best.fitness:.4f}")

    print("\n=== Final Best ===")
    # Reuse the pipeline exactly as it was fitted when the winner was trained.
    # Refitting here would replace the transformers the stored weights were
    # trained against.
    X_test_proc = best.pipeline.transform(X_test)

    input_dim = X_test_proc.shape[1]
    best_model = MLP(best.arch, input_dim).to(device)
    best_model.load_state_dict(best.model_state)

    _, test_acc = evaluate(best_model, device, X_test_proc, y_test, None, args.batch_size)
    print(f" Best Preprocessing: {preprocess_to_str(best.preprocessing)}")
    print(f" Best Architecture: {arch_to_str(best.arch)}")
    print(f" Network Type: Multi-Layer Perceptron (MLP)")
    print(f" Val Acc: {best.fitness:.4f} | Test Acc: {test_acc:.4f}")
    return best

# ----------------------------
# Fixed Arguments
# ----------------------------
class Args:
    """Fixed hyper-parameters for the evolutionary search."""
    seed = 42              # seed applied to the random / numpy / torch RNGs before the run
    pop_size = 6           # individuals per generation
    generations = 4        # number of evolve() rounds
    train_epochs = 3       # epochs used to train each child
    init_train_epochs = 3  # epochs used to train each initial individual
    batch_size = 32
    lr = 0.001             # Adam learning rate
    elitism = 0.3          # fraction of the population carried over unchanged
    tournament_k = 3       # contenders per tournament selection
    min_layers = 1         # minimum depth of an initial architecture
    init_max_layers = 3    # maximum depth of an initial architecture
    max_layers = 5         # depth cap enforced when mutating

args = Args()

# The search draws from Python's `random`, numpy and torch; seed all three so
# a Restart & Run All repeats the same search.
# NOTE(review): some GPU kernels may still be non-deterministic - confirm if
# bit-exact reruns on CUDA are required.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

best_individual = run_evolution(args)

Device: cuda
Using Multi-Layer Perceptron (MLP) architecture
Data loaded: Train=398, Val=85, Test=86

Initializing population...
 Init 1/6: Prep=none|pca30+poly | Arch=U16SD0.3 | U32TD0.1 | val_acc=0.8471
 Init 2/6: Prep=minmax|selectk20+poly | Arch=U32RD0.0 | val_acc=0.8706
 Init 3/6: Prep=minmax|pca20 | Arch=U8RD0.5 | val_acc=0.3765
 Init 4/6: Prep=standard|selectk15 | Arch=U64RD0.5 | U128TD0.5 | val_acc=0.9294
 Init 5/6: Prep=standard|selectk20 | Arch=U64SD0.3 | U128TD0.0 | val_acc=0.9294
 Init 6/6: Prep=none|pca15+poly | Arch=U128TD0.1 | U8RD0.0 | val_acc=0.8471

=== Generation 1 ===
 Best gen 1: Prep=standard|selectk15 | Arch=U64RD0.5 | val_acc=0.9529

=== Generation 2 ===
 Best gen 2: Prep=standard|selectk15 | Arch=U64RD0.5 | val_acc=0.9529

=== Generation 3 ===
 Best gen 3: Prep=standard|selectk15 | Arch=U64RD0.5 | val_acc=0.9529

=== Generation 4 ===
 Best gen 4: Prep=standard|selectk15 | Arch=U64RD0.5 | val_acc=0.9529

=== Final Best ===
 Best Preprocessing: standard|selectk15