In [None]:
import random

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch import nn, optim
from torch.optim.optimizer import Optimizer

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('breast-cancer.csv')

In [None]:
# Print a concise summary of the frame (column dtypes and non-null counts).
df.info()

In [None]:
# Drop the 'id' column when it exists; errors='ignore' makes this a no-op otherwise.
df = df.drop(columns=['id'], errors='ignore')

# Encode the target labels as integers: 'B' -> 0, 'M' -> 1.
# The dtype guard keeps this cell idempotent: re-running it on the already
# encoded 0/1 column would otherwise map every label to NaN.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = df['diagnosis'].map({'B': 0, 'M': 1})

# Feature matrix (every column except the target) and target vector as NumPy arrays.
X = df.drop(columns=['diagnosis']).values
y = df['diagnosis'].values

In [None]:
# Maps an activation name to the nn.Module class that implements it.
# Classes (not instances) are stored so every layer gets its own module object.
activation_dict = dict(
    relu=nn.ReLU,
    tanh=nn.Tanh,
    sigmoid=nn.Sigmoid,
)

In [None]:
class MySGD(Optimizer):
    """Plain stochastic gradient descent: ``p <- p - lr * grad``.

    Args:
        params: iterable of parameters (or parameter-group dicts) to optimise.
        lr: learning rate; must be non-negative.

    Raises:
        ValueError: if ``lr`` is negative.
    """

    def __init__(self,
                 params,
                 lr=0.01):
        if lr < 0.0:
            raise ValueError(f"Некорректное значение lr: {lr}")

        super().__init__(params, {'lr': lr})

    def step(self, closure=None):
        """Apply one update to every parameter that has a gradient.

        Args:
            closure: optional callable that re-evaluates the model and returns
                the loss; it is invoked with gradients enabled.

        Returns:
            The value returned by ``closure``, or ``None`` when none was given.
        """
        if closure is None:
            loss = None
        else:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            neg_lr = -group['lr']

            # Parameters without a gradient are left untouched.
            with_grad = (w for w in group['params'] if w.grad is not None)
            for w in with_grad:
                w.data.add_(w.grad.data, alpha=neg_lr)

        return loss

In [None]:
class MyNAG(Optimizer):
    """SGD with Nesterov-style momentum.

    Per parameter the update is::

        buf <- grad                      (first step)
        buf <- momentum * buf + grad     (later steps)
        p   <- p - lr * (grad + momentum * buf)

    Args:
        params: iterable of parameters (or parameter-group dicts) to optimise.
        lr: learning rate; must be non-negative.
        momentum: momentum coefficient; must be non-negative.

    Raises:
        ValueError: if ``lr`` or ``momentum`` is negative.
    """

    def __init__(self,
                 params,
                 lr=0.01,
                 momentum=0.9):
        if lr < 0.0:
            raise ValueError(f"Некорректное значение lr: {lr}")
        if momentum < 0.0:
            raise ValueError(f"Некорректное значение momentum: {momentum}")

        super().__init__(params, {'lr': lr, 'momentum': momentum})

    def step(self, closure=None):
        """Apply one Nesterov-momentum update.

        Args:
            closure: optional callable that re-evaluates the model and returns
                the loss; it is invoked with gradients enabled.

        Returns:
            The value returned by ``closure``, or ``None`` when none was given.
        """
        if closure is None:
            loss = None
        else:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr, momentum = group['lr'], group['momentum']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                slot = self.state[p]

                if 'momentum_buffer' in slot:
                    velocity = slot['momentum_buffer']
                    velocity.mul_(momentum).add_(grad)
                else:
                    # First step for this parameter: seed the buffer with the gradient.
                    velocity = grad.clone().detach()
                    slot['momentum_buffer'] = velocity

                # Look-ahead direction: current gradient plus the scaled velocity.
                lookahead = torch.add(grad, velocity, alpha=momentum)
                p.data.add_(lookahead, alpha=-lr)

        return loss

In [None]:
class MyAdagrad(Optimizer):
    """Adagrad: scales each step by the root of the accumulated squared gradients.

    Per parameter the update is::

        sum_squares <- sum_squares + grad * grad
        p           <- p - lr * grad / (sqrt(sum_squares) + eps)

    Args:
        params: iterable of parameters (or parameter-group dicts) to optimise.
        lr: learning rate; must be non-negative.
        eps: term added to the denominator; must be non-negative.

    Raises:
        ValueError: if ``lr`` or ``eps`` is negative.
    """

    def __init__(self,
                 params,
                 lr=1e-2,
                 eps=1e-10):
        if lr < 0.0:
            raise ValueError(f"Некорректное значение lr: {lr}")
        if eps < 0.0:
            raise ValueError(f"Некорректное значение eps: {eps}")

        # 'step' is stored in the group defaults but is not read by step().
        super().__init__(params, {'lr': lr, 'eps': eps, 'step': 0})

    def step(self, closure=None):
        """Apply one Adagrad update.

        Args:
            closure: optional callable that re-evaluates the model and returns
                the loss; it is invoked with gradients enabled.

        Returns:
            The value returned by ``closure``, or ``None`` when none was given.
        """
        if closure is None:
            loss = None
        else:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr, eps = group['lr'], group['eps']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data

                slot = self.state[p]
                if 'sum_squares' not in slot:
                    slot['sum_squares'] = torch.zeros_like(p.data)
                accumulated = slot['sum_squares']

                accumulated.addcmul_(grad, grad, value=1.0)

                # sqrt() allocates a new tensor, so the in-place add_ does not
                # touch the stored accumulator.
                scale = accumulated.sqrt().add_(eps)
                p.data.addcdiv_(grad, scale, value=-lr)

        return loss

In [None]:
class MyRMSprop(Optimizer):
    """RMSprop: scales each step by a running average of squared gradients.

    Per parameter the update is::

        square_avg <- alpha * square_avg + (1 - alpha) * grad * grad
        p          <- p - lr * grad / (sqrt(square_avg) + eps)

    Args:
        params: iterable of parameters (or parameter-group dicts) to optimise.
        lr: learning rate; must be non-negative.
        alpha: decay factor of the running average; must be non-negative.
        eps: term added to the denominator; must be non-negative.

    Raises:
        ValueError: if ``lr``, ``alpha`` or ``eps`` is negative.
    """

    def __init__(self,
                 params,
                 lr=1e-2,
                 alpha=0.99,
                 eps=1e-8):
        if lr < 0.0:
            raise ValueError(f"Некорректное значение lr: {lr}")
        if alpha < 0.0:
            raise ValueError(f"Некорректное значение alpha: {alpha}")
        if eps < 0.0:
            raise ValueError(f"Некорректное значение eps: {eps}")

        super().__init__(params, {'lr': lr, 'alpha': alpha, 'eps': eps})

    def step(self, closure=None):
        """Apply one RMSprop update.

        Args:
            closure: optional callable that re-evaluates the model and returns
                the loss; it is invoked with gradients enabled.

        Returns:
            The value returned by ``closure``, or ``None`` when none was given.
        """
        if closure is None:
            loss = None
        else:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr, alpha, eps = group['lr'], group['alpha'], group['eps']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data

                slot = self.state[p]
                if 'square_avg' not in slot:
                    slot['square_avg'] = torch.zeros_like(p.data)
                running_sq = slot['square_avg']

                # Exponential moving average of the squared gradient.
                running_sq.mul_(alpha).addcmul_(grad, grad, value=1 - alpha)

                rms = running_sq.sqrt().add_(eps)
                p.data.addcdiv_(grad, rms, value=-lr)

        return loss

In [None]:
class MyAdam(Optimizer):
    """Adam with bias-corrected first and second moment estimates.

    Per parameter the update is::

        exp_avg    <- beta1 * exp_avg    + (1 - beta1) * grad
        exp_avg_sq <- beta2 * exp_avg_sq + (1 - beta2) * grad * grad
        p          <- p - step_size * exp_avg / (sqrt(exp_avg_sq) + eps)

    with ``step_size = lr * sqrt(1 - beta2**t) / (1 - beta1**t)``. The step
    counter ``t`` is kept per parameter group (``group['step']``), not per
    parameter.

    Args:
        params: iterable of parameters (or parameter-group dicts) to optimise.
        lr: learning rate; must be non-negative.
        betas: ``(beta1, beta2)`` decay rates, each in ``[0, 1)``.
        eps: term added to the denominator; must be non-negative.

    Raises:
        ValueError: if any hyper-parameter is outside its allowed range.
    """

    def __init__(self,
                 params,
                 lr=1e-3,
                 betas=(0.9, 0.999),
                 eps=1e-8):
        if lr < 0.0:
            raise ValueError(f"Некорректное значение lr: {lr}")
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(f"Некорректное значение beta1: {betas[0]}")
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(f"Некорректное значение beta2: {betas[1]}")
        if eps < 0.0:
            raise ValueError(f"Некорректное значение eps: {eps}")

        super().__init__(params, {'lr': lr, 'betas': betas, 'eps': eps, 'step': 0})

    def step(self, closure=None):
        """Apply one Adam update.

        Args:
            closure: optional callable that re-evaluates the model and returns
                the loss; it is invoked with gradients enabled.

        Returns:
            The value returned by ``closure``, or ``None`` when none was given.
        """
        if closure is None:
            loss = None
        else:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            betas = group['betas']
            eps = group['eps']

            group['step'] += 1
            t = group['step']
            beta1, beta2 = betas

            # The bias corrections depend only on the group's step counter, so
            # they are computed once per group rather than once per parameter.
            bias_correction1 = 1 - beta1 ** t
            bias_correction2 = 1 - beta2 ** t
            step_size = lr * (bias_correction2 ** 0.5) / bias_correction1

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                slot = self.state[p]

                if 'exp_avg' not in slot:
                    slot['exp_avg'] = torch.zeros_like(p.data)
                first_moment = slot['exp_avg']

                if 'exp_avg_sq' not in slot:
                    slot['exp_avg_sq'] = torch.zeros_like(p.data)
                second_moment = slot['exp_avg_sq']

                first_moment.mul_(beta1).add_(grad, alpha=1 - beta1)
                second_moment.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                denom = second_moment.sqrt().add_(eps)
                p.data.addcdiv_(first_moment, denom, value=-step_size)

        return loss

In [None]:
def create_model(input_dim, hidden_config, output_dim=1):
    """
    Build an MLP from a list of hidden-layer specifications.

    Args:
        input_dim: number of input features.
        hidden_config: [(n_neurons, activation_name), ...], one pair per hidden
            layer; activation_name must be a key of activation_dict.
        output_dim: number of output units (1 here: binary classification).

    Returns:
        nn.Sequential of Linear/activation pairs followed by a final Linear layer.
    """
    modules = []
    width = input_dim

    for n_units, act_name in hidden_config:
        modules.extend((nn.Linear(width, n_units), activation_dict[act_name]()))
        width = n_units

    # Output layer: no Sigmoid here, because the model is trained with
    # BCEWithLogitsLoss, which expects raw logits.
    modules.append(nn.Linear(width, output_dim))
    return nn.Sequential(*modules)

In [None]:
def train_model(model,
                X_train, y_train,
                X_val, y_val,
                epochs=20,
                batch_size=32,
                lr=1e-3,
                device='cpu',
                optimizer='Adam'):
    """Train a binary classifier with mini-batches and return validation accuracy.

    Args:
        model: module producing one logit per sample.
        X_train, y_train: training features and 0/1 labels (array-like).
        X_val, y_val: validation features and 0/1 labels (array-like).
        epochs: number of passes over the training data.
        batch_size: mini-batch size; the sample order is reshuffled every epoch.
        lr: learning rate handed to the optimizer.
        device: torch device the model and data are moved to.
        optimizer: one of 'Adam', 'NAG', 'RMSProp', 'Adagrad', 'SGD'.

    Returns:
        Fraction of validation samples whose thresholded prediction
        (sigmoid(logit) >= 0.5) equals the label.

    Raises:
        ValueError: if ``optimizer`` is not one of the supported names.
    """
    def features(array):
        return torch.tensor(array, dtype=torch.float32).to(device)

    def targets(array):
        # Column vector so the shape matches the model's (N, 1) logits.
        return torch.tensor(array, dtype=torch.float32).view(-1, 1).to(device)

    train_x, train_y = features(X_train), targets(y_train)
    val_x, val_y = features(X_val), targets(y_val)

    model.to(device)

    if optimizer == 'Adam':
        opt = MyAdam(model.parameters(), lr=lr)
    elif optimizer == 'NAG':
        opt = MyNAG(model.parameters(), lr=lr)
    elif optimizer == 'RMSProp':
        opt = MyRMSprop(model.parameters(), lr=lr)
    elif optimizer == 'Adagrad':
        opt = MyAdagrad(model.parameters(), lr=lr)
    elif optimizer == 'SGD':
        opt = MySGD(model.parameters(), lr=lr)
    else:
        raise ValueError('Unknow optimizer parameter')

    loss_fn = nn.BCEWithLogitsLoss()
    n_train = train_x.size(0)

    for _ in range(epochs):
        model.train()
        order = torch.randperm(n_train)
        for start in range(0, n_train, batch_size):
            batch = order[start:start + batch_size]
            opt.zero_grad()
            batch_loss = loss_fn(model(train_x[batch]), train_y[batch])
            batch_loss.backward()
            opt.step()

    model.eval()
    with torch.no_grad():
        predicted = (torch.sigmoid(model(val_x)) >= 0.5).float()
        n_correct = (predicted == val_y).sum().item()

    return n_correct / len(val_y)

In [None]:
def evaluate_model_cv(hidden_config, X, y, n_splits=5, epochs=20, batch_size=32, lr=1e-3, device='cpu', optimizer='Adam', verbose=False):
    """Estimate an architecture's accuracy with stratified k-fold cross-validation.

    For every fold a fresh model is built with create_model, the features are
    standardised with a scaler fitted on the training part only (so no
    validation statistics leak into training), and train_model returns the
    fold's validation accuracy.

    Args:
        hidden_config: [(n_neurons, activation_name), ...] passed to create_model.
        X: 2-D NumPy feature array (indexed with integer index arrays).
        y: 1-D NumPy label array.
        n_splits: number of folds.
        epochs, batch_size, lr, device, optimizer: forwarded to train_model.
        verbose: show a tqdm progress bar over the folds.

    Returns:
        Mean validation accuracy over all folds.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    folds = skf.split(X, y)
    if verbose:
        # split() yields folds lazily and has no length, so tqdm needs the
        # total spelled out to display a percentage and an ETA.
        folds = tqdm(folds, total=n_splits)

    input_dim = X.shape[1]
    fold_accuracies = []

    for train_idx, val_idx in folds:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train_idx])
        X_val = scaler.transform(X[val_idx])
        y_train, y_val = y[train_idx], y[val_idx]

        model = create_model(input_dim, hidden_config)

        val_acc = train_model(model,
                              X_train, y_train,
                              X_val, y_val,
                              epochs=epochs,
                              batch_size=batch_size,
                              lr=lr,
                              device=device,
                              optimizer=optimizer)
        fold_accuracies.append(val_acc)

    return np.mean(fold_accuracies)

In [None]:
# Baseline architecture (one 16-unit ReLU layer) trained with the hand-written SGD.
# Fall back to the CPU so the cell still runs on machines without CUDA.
evaluate_model_cv(
    [(16, "relu")],
    X,
    y,
    lr=1e-2,
    n_splits=100,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    optimizer='SGD',
    verbose=True,
)

In [None]:
# Same architecture trained with the hand-written Nesterov-momentum optimizer.
# Fall back to the CPU so the cell still runs on machines without CUDA.
evaluate_model_cv(
    [(16, "relu")],
    X,
    y,
    lr=1e-2,
    n_splits=100,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    optimizer='NAG',
    verbose=True,
)

In [None]:
# Same architecture trained with the hand-written Adagrad optimizer.
# Fall back to the CPU so the cell still runs on machines without CUDA.
evaluate_model_cv(
    [(16, "relu")],
    X,
    y,
    lr=1e-2,
    n_splits=100,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    optimizer='Adagrad',
    verbose=True,
)

In [None]:
# Same architecture trained with the hand-written RMSprop optimizer.
# Fall back to the CPU so the cell still runs on machines without CUDA.
evaluate_model_cv(
    [(16, "relu")],
    X,
    y,
    lr=1e-2,
    n_splits=100,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    optimizer='RMSProp',
    verbose=True,
)

In [None]:
# Same architecture trained with the hand-written Adam optimizer.
# Fall back to the CPU so the cell still runs on machines without CUDA.
evaluate_model_cv(
    [(16, "relu")],
    X,
    y,
    lr=1e-2,
    n_splits=100,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    optimizer='Adam',
    verbose=True,
)

# Genetic algorithm

In [None]:
"""
Hyper-parameter encoding used by the genetic algorithm:
- Number of hidden layers H, bounded by H_min..H_max (defined below)
- For every layer: number of neurons N, bounded by N_min..N_max (defined below)
- For every layer: activation function, one of ['relu', 'tanh', 'sigmoid']

So for H=2 an individual describes:
[
  (n_neurons_layer1, activation_layer1),
  (n_neurons_layer2, activation_layer2)
]

H=1 means a single hidden layer, H=3 means three layers, and so on.

For simplicity an individual stores a flat, fixed-length list:
[H, n1, act1, n2, act2, n3, act3, n4, act4, n5, act5]

Only the first H (n, act) pairs are used when the list is decoded into a
network configuration; the remaining pairs are carried along unused.
"""

H_min, H_max = 1, 5
N_min, N_max = 4, 128
activations = list(activation_dict.keys())  # ["relu", "tanh", "sigmoid"]

In [None]:
class Individual:
    """A candidate solution: a gene list plus the fitness score assigned to it."""

    def __init__(self, genes):
        # Fitness starts at zero and is overwritten when the individual is evaluated.
        self.fitness = 0.0
        self.genes = genes


class GeneticOptimizer:
    """Genetic search over MLP architectures scored by cross-validated accuracy.

    A genome is the flat list ``[H, n1, act1, ..., n_Hmax, act_Hmax]``: gene 0 is
    the number of active hidden layers, and every following (width, activation)
    pair describes one layer. Only the first ``H`` pairs are decoded.

    Args:
        pop_size: number of individuals per generation.
        n_generations: number of generations to evaluate.
        cx_prob: probability that two selected parents are crossed over.
        mut_prob: per-gene mutation probability.
        n_epochs: training epochs used for every fitness evaluation.
        n_splits: cross-validation folds used for every fitness evaluation.
        batch_size: mini-batch size used during fitness evaluation.
        lr: learning rate used during fitness evaluation.
        device: torch device for training; ``None`` selects 'cuda' when it is
            available and 'cpu' otherwise.
        tournament_size: number of contestants in tournament selection.
    """

    def __init__(self, pop_size=15, n_generations=10, cx_prob=0.5,
                 mut_prob=0.2, n_epochs=5, n_splits=100, batch_size=32,
                 lr=1e-2, device=None, tournament_size=3):
        self.pop_size = pop_size
        self.n_generations = n_generations
        self.cx_prob = cx_prob
        self.mut_prob = mut_prob
        self.n_epochs = n_epochs
        self.n_splits = n_splits
        self.batch_size = batch_size
        self.lr = lr
        self.device = device
        self.tournament_size = tournament_size
        self.hall_of_fame = None

    @staticmethod
    def _clone(individual):
        """Return an independent copy of ``individual`` (own gene list, same fitness)."""
        twin = Individual(list(individual.genes))
        twin.fitness = individual.fitness
        return twin

    def _create_individual(self):
        """Sample a random genome with one (width, activation) pair per possible layer."""
        # random.randint is inclusive on BOTH ends, so the bounds are passed as-is;
        # an H above H_max would index past the end of the genome when decoded.
        genes = [random.randint(H_min, H_max)]  # num layers
        for _ in range(H_max):
            genes.append(random.randint(N_min, N_max))
            genes.append(random.choice(activations))
        return Individual(genes)

    def initialize_population(self):
        """Create ``pop_size`` random individuals."""
        return [self._create_individual() for _ in range(self.pop_size)]

    def decode_individual(self, ind):
        """Translate a genome into ``[(n_neurons, activation_name), ...]`` for its first H layers."""
        H = ind.genes[0]
        hidden_config = []
        for i in range(H):
            n_i = ind.genes[1 + i*2]
            a_i = ind.genes[2 + i*2]
            hidden_config.append((n_i, a_i))
        return hidden_config

    def evaluate_population(self, population, X, y):
        """Set every individual's fitness to its CV accuracy and refresh the hall of fame."""
        device = self.device
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

        for ind in population:
            hidden_config = self.decode_individual(ind)

            acc = evaluate_model_cv(hidden_config, X, y,
                                    n_splits=self.n_splits,
                                    epochs=self.n_epochs,
                                    batch_size=self.batch_size,
                                    lr=self.lr,
                                    device=device)
            ind.fitness = acc

        # Update hall of fame. A copy is stored so that later in-place mutation
        # of population members cannot alter the recorded best genome.
        current_best = max(population, key=lambda x: x.fitness)
        if self.hall_of_fame is None or current_best.fitness > self.hall_of_fame.fitness:
            self.hall_of_fame = self._clone(current_best)

    def _select_parent(self, population):
        """Tournament selection: the fittest of a random sample wins."""
        # Never ask for more contestants than the population holds.
        k = min(self.tournament_size, len(population))
        tournament = random.sample(population, k)
        return max(tournament, key=lambda x: x.fitness)

    def _crossover(self, parent1, parent2):
        """Single-point crossover; always returns two NEW individuals.

        When crossover is skipped the children are copies of the parents, so
        mutating a child never modifies an individual of the previous generation.
        """
        n_genes = min(len(parent1.genes), len(parent2.genes))
        if n_genes < 2 or random.random() > self.cx_prob:
            return self._clone(parent1), self._clone(parent2)

        # Cut strictly inside the genome so both parents contribute genes.
        cxpoint = random.randint(1, n_genes - 1)
        child1 = Individual(parent1.genes[:cxpoint] + parent2.genes[cxpoint:])
        child2 = Individual(parent2.genes[:cxpoint] + parent1.genes[cxpoint:])
        return child1, child2

    def _mutate(self, individual):
        """Resample each gene with probability ``mut_prob`` (in place); returns the individual."""
        for i in range(len(individual.genes)):
            if random.random() >= self.mut_prob:
                continue
            if i == 0:
                # Gene 0: number of layers.
                individual.genes[i] = random.randint(H_min, H_max)
            elif i % 2 == 1:
                # Odd positions: layer widths.
                individual.genes[i] = random.randint(N_min, N_max)
            else:
                # Even positions after 0: activation names.
                individual.genes[i] = random.choice(activations)
        return individual

    def evolve(self, X, y):
        """Run the evolutionary loop and return the best individual seen.

        Every generation is evaluated, its max/mean/min fitness printed, and a
        new population is bred by tournament selection, crossover and mutation.
        """
        population = self.initialize_population()

        for gen in range(self.n_generations):
            self.evaluate_population(population, X, y)

            fitnesses = [ind.fitness for ind in population]
            print(f"\nGeneration {gen + 1}/{self.n_generations}")
            print(f"Max Fitness: {max(fitnesses):.2f}")
            print(f"Avg Fitness: {np.mean(fitnesses):.2f}")
            print(f"Min Fitness: {min(fitnesses):.2f}")

            new_pop = []
            while len(new_pop) < self.pop_size:
                parent1 = self._select_parent(population)
                parent2 = self._select_parent(population)
                child1, child2 = self._crossover(parent1, parent2)

                for child in [child1, child2]:
                    if len(new_pop) >= self.pop_size:
                        break
                    self._mutate(child)
                    new_pop.append(child)

            population = new_pop

        return self.hall_of_fame

In [None]:
import time

# Run configuration for the genetic search.
# NOTE(review): BATCH_SIZE and DEVICES are not referenced by any code visible
# in this notebook — confirm whether they are still needed.
BATCH_SIZE = 64
DEVICES = [0, 1, 2]
N_GENERATIONS = 10
POP_SIZE = 15

ga = GeneticOptimizer(
        pop_size=POP_SIZE,
        n_generations=N_GENERATIONS,
        n_epochs=50
    )

# Time the whole search with a monotonic clock; `best` receives the value
# returned by evolve() (the optimizer's hall-of-fame individual).
# NOTE(review): `start`/`end` are not used further in the visible code.
start = time.monotonic()
best = ga.evolve(X, y)
end = time.monotonic()