In [10]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
from scipy.special import softmax

# Read the labelled digit images; every row is one image plus its 'label'.
df_test = pd.read_csv('mnist_test.csv')

# Split targets from pixel columns and scale the pixels by 1/255.
y = df_test['label'].values
X = df_test.drop('label', axis=1).values / 255.0

# One-hot encode the labels (the encoder expects a 2-D column of labels).
encoder = OneHotEncoder(sparse_output=False)
y = encoder.fit_transform(y[:, np.newaxis])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

    The exponential is only ever taken of a non-positive number, so large
    negative inputs do not overflow (the direct formula computes
    ``exp(+large)`` and triggers a NumPy overflow warning).

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        Values in [0, 1] with the shape of ``x``; a scalar input yields a
        scalar result.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always within [0, 1], so it cannot overflow
    # x >= 0: 1 / (1 + e^-x);  x < 0: the algebraically equal e^x / (1 + e^x).
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    return result[()]  # unwrap 0-d results so scalars stay scalars

def sigmoid_derivative(x):
    """Return ``x * (1 - x)``, the sigmoid slope written in terms of its output.

    ``x`` is used as given, so it must already be a sigmoid activation
    (not the pre-activation value) for the result to be the derivative.
    """
    complement = 1 - x
    return complement * x

# Genetic-algorithm settings.
# Number of evaluate/select/breed rounds run by the main loop.
generations = 10
# Number of chromosomes in each generation; selection keeps half of this.
population_size = 10
# Probability that a freshly bred child has one gene re-drawn.
mutation_rate = 0.1

# Define Genetic Algorithm Functions
def create_chromosome():
    """Draw a random hyperparameter set.

    Returns:
        A list ``[learning_rate, hidden_size_1, hidden_size_2]`` where the
        learning rate is uniform in [0.001, 0.1) and each hidden size is
        picked from 32, 64, 128 or 256.
    """
    layer_widths = [32, 64, 128, 256]
    genes = [np.random.uniform(0.001, 0.1)]
    # Two independent draws, one per hidden layer.
    genes.extend(np.random.choice(layer_widths) for _ in range(2))
    return genes

def fitness(chromosome, X, y):
    """Score a chromosome with the value returned by ``train_and_evaluate``.

    Args:
        chromosome: Three-item sequence
            ``(learning_rate, hidden_size_1, hidden_size_2)``.
        X: Passed through to ``train_and_evaluate`` unchanged.
        y: Passed through to ``train_and_evaluate`` unchanged.

    Returns:
        Whatever ``train_and_evaluate`` returns for these hyperparameters.
    """
    rate, width_1, width_2 = chromosome
    score = train_and_evaluate(X, y, rate, width_1, width_2)
    return score

def select(population, fitness_scores):
    """Return the fitter half of ``population``, best first.

    Ranking uses the fitness score only. Chromosomes with equal scores keep
    their original relative order and are never compared with each other, so
    any chromosome type works. The number of survivors is derived from the
    population passed in rather than from a module-level constant.

    Args:
        population: Sequence of chromosomes.
        fitness_scores: One score per chromosome, in the same order.

    Returns:
        A list with the ``len(population) // 2`` highest-scoring chromosomes.
    """
    ranked = sorted(
        zip(fitness_scores, population),
        key=lambda scored: scored[0],
        reverse=True,
    )
    survivors = len(population) // 2
    return [chromosome for _, chromosome in ranked[:survivors]]

def crossover(parent1, parent2):
    """Single-point crossover of two parents.

    A cut position between 1 and ``len(parent1) - 1`` is drawn at random and
    the parents swap everything from that position onwards. The parents are
    not modified.

    Returns:
        A pair ``(child1, child2)``.
    """
    cut = np.random.randint(1, len(parent1))
    head1, tail1 = parent1[:cut], parent1[cut:]
    head2, tail2 = parent2[:cut], parent2[cut:]
    return head1 + tail2, head2 + tail1

def mutate(chromosome, mutation_rate):
    """Re-draw one random gene with probability ``mutation_rate``.

    The chromosome is modified in place and also returned. Position 0 is
    re-drawn uniformly from [0.001, 0.1); any other position is re-drawn
    from 32, 64, 128 or 256.
    """
    if not (np.random.rand() < mutation_rate):
        return chromosome

    gene = np.random.randint(0, len(chromosome))
    chromosome[gene] = (
        np.random.uniform(0.001, 0.1)
        if gene == 0
        else np.random.choice([32, 64, 128, 256])
    )
    return chromosome

def decode_chromosome(chromosome):
    """Return ``chromosome`` unchanged.

    The chromosome already stores the hyperparameters directly, so there is
    nothing to decode; the function is a pass-through.
    """
    return chromosome

def _forward_pass(inputs, W1, b1, W2, b2, W3, b3):
    """Propagate ``inputs`` through the three sigmoid layers.

    Returns the activations of the first hidden, second hidden and output
    layers, in that order.
    """
    A1 = sigmoid(np.dot(inputs, W1) + b1)
    A2 = sigmoid(np.dot(A1, W2) + b2)
    A3 = sigmoid(np.dot(A2, W3) + b3)
    return A1, A2, A3


def train_and_evaluate(X, y, learning_rate, hidden_size_1, hidden_size_2,
                       epochs=10, batch_size=32):
    """Train a two-hidden-layer sigmoid network and return validation accuracy.

    The network is trained with plain mini-batch gradient descent on the
    ``X``/``y`` passed in and scored on the module-level ``X_val``/``y_val``.

    Args:
        X: 2-D array of training inputs, one row per sample.
        y: 2-D array of one-hot targets, one row per sample.
        learning_rate: Step size of every weight/bias update.
        hidden_size_1: Width of the first hidden layer.
        hidden_size_2: Width of the second hidden layer.
        epochs: Number of passes over the training data.
        batch_size: Number of rows per mini-batch.

    Returns:
        Fraction of validation rows whose predicted class matches the target.
    """
    # Layer sizes follow the data instead of being hard-coded.
    input_size = X.shape[1]
    output_size = y.shape[1]

    # A private generator yields the same weights as np.random.seed(42) did,
    # but leaves the global NumPy stream alone. Re-seeding the global stream
    # here made every later crossover/mutation draw repeat.
    # NOTE(review): the weights are all positive (uniform in [0, 0.01));
    # a zero-mean initialisation would probably train better. Left as-is.
    rng = np.random.RandomState(42)
    W1 = rng.rand(input_size, hidden_size_1) * 0.01
    b1 = np.zeros((1, hidden_size_1))
    W2 = rng.rand(hidden_size_1, hidden_size_2) * 0.01
    b2 = np.zeros((1, hidden_size_2))
    W3 = rng.rand(hidden_size_2, output_size) * 0.01
    b3 = np.zeros((1, output_size))

    for _ in range(epochs):
        for start in range(0, X.shape[0], batch_size):
            X_batch = X[start:start + batch_size]
            y_batch = y[start:start + batch_size]
            # The last batch may be short; average over the rows present.
            rows = X_batch.shape[0]

            A1, A2, A3 = _forward_pass(X_batch, W1, b1, W2, b2, W3, b3)

            # Backpropagation, output layer first.
            dZ3 = A3 - y_batch
            dW3 = np.dot(A2.T, dZ3) / rows
            db3 = np.sum(dZ3, axis=0, keepdims=True) / rows

            dZ2 = np.dot(dZ3, W3.T) * sigmoid_derivative(A2)
            dW2 = np.dot(A1.T, dZ2) / rows
            db2 = np.sum(dZ2, axis=0, keepdims=True) / rows

            dZ1 = np.dot(dZ2, W2.T) * sigmoid_derivative(A1)
            dW1 = np.dot(X_batch.T, dZ1) / rows
            db1 = np.sum(dZ1, axis=0, keepdims=True) / rows

            # Gradient-descent step.
            W1 -= learning_rate * dW1
            b1 -= learning_rate * db1
            W2 -= learning_rate * dW2
            b2 -= learning_rate * db2
            W3 -= learning_rate * dW3
            b3 -= learning_rate * db3

    # Score on the held-out rows. No softmax is applied: it preserves the
    # ordering of the outputs, so it cannot change the argmax.
    _, _, A3_val = _forward_pass(X_val, W1, b1, W2, b2, W3, b3)
    val_predictions = np.argmax(A3_val, axis=1)
    val_true = np.argmax(y_val, axis=1)
    val_accuracy = accuracy_score(val_true, val_predictions)

    return val_accuracy

# Genetic Algorithm Execution
population = [create_chromosome() for _ in range(population_size)]

# Training rebuilds its weights from a fixed seed, so a chromosome always
# receives the same score. Remember scores to avoid retraining duplicates.
_fitness_cache = {}


def _cached_fitness(chromosome):
    """Return the fitness of ``chromosome``, training only on first sight."""
    key = tuple(chromosome)
    if key not in _fitness_cache:
        _fitness_cache[key] = fitness(chromosome, X_train, y_train)
    return _fitness_cache[key]


# Best individual seen in ANY generation. Breeding replaces the whole
# population, so without this record a good chromosome can be lost.
best_fitness = float("-inf")
best_chromosome = None

for generation in range(generations):
    fitness_scores = [_cached_fitness(chromosome) for chromosome in population]
    print(f"Generation {generation+1}, Best Fitness: {max(fitness_scores):.4f}")

    top_score, top_chromosome = max(
        zip(fitness_scores, population), key=lambda scored: scored[0]
    )
    if top_score > best_fitness:
        # Copy so later in-place mutation cannot alter the recorded winner.
        best_fitness, best_chromosome = top_score, list(top_chromosome)

    selected = select(population, fitness_scores)
    next_generation = []

    while len(next_generation) < population_size:
        parent1, parent2 = random.sample(selected, 2)
        child1, child2 = crossover(parent1, parent2)
        next_generation.append(mutate(child1, mutation_rate))
        next_generation.append(mutate(child2, mutation_rate))

    population = next_generation

# The children bred in the last round have not been scored yet; include them.
final_scores = [_cached_fitness(chromosome) for chromosome in population]
top_score, top_chromosome = max(
    zip(final_scores, population), key=lambda scored: scored[0]
)
if top_score > best_fitness:
    best_fitness, best_chromosome = top_score, list(top_chromosome)

best_learning_rate, best_hidden_size_1, best_hidden_size_2 = decode_chromosome(best_chromosome)
print(f"Optimal Hyperparameters: Learning Rate = {best_learning_rate}, Hidden Sizes = {best_hidden_size_1}, {best_hidden_size_2}")

Generation 1, Best Fitness: 0.3335
Generation 2, Best Fitness: 0.3205
Generation 3, Best Fitness: 0.3335
Generation 4, Best Fitness: 0.3335
Generation 5, Best Fitness: 0.3335
Generation 6, Best Fitness: 0.3335
Generation 7, Best Fitness: 0.3335
Generation 8, Best Fitness: 0.3335
Generation 9, Best Fitness: 0.3335
Generation 10, Best Fitness: 0.3335
Optimal Hyperparameters: Learning Rate = 0.09674685321847672, Hidden Sizes = 128, 256
