In [356]:
import pandas as pd
from sklearn.metrics.cluster import v_measure_score

In [357]:
df = pd.read_csv('breast_cancer_coimbra_train.csv')

In [None]:
import numpy as np
import random
import operator
from copy import deepcopy

# Define the operations allowed in the trees
OPERATORS = {
    'add': operator.add,
    'sub': operator.sub,
    'mul': operator.mul,
    'div': operator.truediv  # safe division
}

TERMINALS = df.columns[:-1]  # Your dataset's features
TERMINALS


# Node class to represent tree nodes (either operator or terminal)
class Node:
    def __init__(self, value, left=None, right=None):
        self.value = value
        self.left = left
        self.right = right

    def evaluate(self, features):
        """ Recursively evaluate the expression tree based on feature values in a DataFrame row """
        if isinstance(self.value, str):
            if self.value in OPERATORS:
                # Recursively evaluate left and right subtrees
                left_val = self.left.evaluate(features)
                right_val = self.right.evaluate(features)
                return OPERATORS[self.value](left_val, right_val)
            else:
                # It's a terminal (feature), so we access the DataFrame row using the column name
                return features[self.value]  # Assumes 'features' is a pandas row
        else:
            # It's a constant value (e.g., a number)
            return self.value   # it's a constant



# Random Tree Initialization
def generate_random_tree(depth=3):
    if depth == 0:
        # Base case: return a terminal node
        terminal = random.choice(TERMINALS)
        return Node(terminal)
    else:
        # Recursive case: return an operator node with two subtrees
        operator = random.choice(list(OPERATORS.keys()))
        left_subtree = generate_random_tree(depth - 1)
        right_subtree = generate_random_tree(depth - 1)
        return Node(operator, left_subtree, right_subtree)


# Fitness function (mean squared error)
def fitness_function(individual, X, y):
    predictions = np.array(individual.evaluate(X.iloc[:]))
    return np.mean((predictions - y) ** 2)


# Tournament Selection
def tournament_selection(population, fitnesses, tournament_size=3):
    selected = random.sample(list(zip(population, fitnesses)), tournament_size)
    return min(selected, key=lambda x: x[1])[0]  # Return the individual with the best fitness


# Crossover between two trees
def crossover(tree1, tree2, max_depth=3):
    if random.random() < 0.9:  # 70% chance to swap subtrees
        if isinstance(tree1.value, str) and tree1.value in OPERATORS:
            return Node(tree1.value, crossover(tree1.left, tree2.left, max_depth-1), crossover(tree1.right, tree2.right, max_depth-1))
        return deepcopy(tree2)
    else:
        return deepcopy(tree1)


# Mutation (replace a subtree with a random subtree)
def mutation(tree, max_depth=3):
    if random.random() < 0.1:  # 10% chance of mutation
        return generate_random_tree(max_depth)
    else:
        if isinstance(tree.value, str) and tree.value in OPERATORS:
            return Node(tree.value, mutation(tree.left, max_depth-1), mutation(tree.right, max_depth-1))
        return deepcopy(tree)

def print_tree(node, depth=0):
    """ Recursively prints the structure of the tree with indentation for clarity """
    indent = "  " * depth  # Indentation based on the depth of the node
    if node.left is None and node.right is None:
        # It's a terminal (leaf) node
        print(f"{indent}{node.value}")
    else:
        # It's an operator node
        print(f"{indent}{node.value}")
        # Recursively print left and right subtrees
        if node.left:
            print_tree(node.left, depth + 1)
        if node.right:
            print_tree(node.right, depth + 1)

# Evolutionary Algorithm (Genetic Programming)
def evolve_population(X, y, k,pop_size=100, generations=10, max_depth=3):
    # Step 1: Initialize the population
    population = [generate_random_tree(max_depth) for _ in range(pop_size)]

    for generation in range(generations):
        # Step 2: Evaluate fitness
        fitnesses = [fitness_function(ind, X, y) for ind in population]

        # Step 3: Selection and reproduction
        new_population = []
        while len(new_population) < pop_size:
            # Tournament selection
            parent1 = tournament_selection(population, fitnesses,k)
            parent2 = tournament_selection(population, fitnesses,k)

            # Crossover
            child = crossover(parent1, parent2, max_depth)

            # Mutation
            child = mutation(child, max_depth)

            new_population.append(child)

        # Step 4: Replace the old population with the new population
        population = new_population

        # Print best fitness every generation
        best_fitness = min(fitnesses)
        worst_fitness = max(fitnesses)
        print(f"Generation {generation + 1}, Best Fitness: {best_fitness}")
        print(f"Generation {generation + 1}, Worst Fitness: {worst_fitness}")

    # Return the best individual after evolution
    best_individual = min(population, key=lambda ind: fitness_function(ind, X, y))
    return best_individual




In [359]:
df = pd.read_csv('breast_cancer_coimbra_train.csv')

In [360]:
X = df.drop(columns=['Classification'])
y = df['Classification']
best_tree = evolve_population(X, y, 10,pop_size=100, generations=100, max_depth=7)
print(f"Best individual: {best_tree}")
print(f"Best fitness: {fitness_function(best_tree, X, y)}")

Generation 1, Best Fitness: 2.6422448374964485
Generation 1, Worst Fitness: inf
Generation 2, Best Fitness: 2.5223880597014925
Generation 2, Worst Fitness: inf
Generation 3, Best Fitness: 2.50270096661447
Generation 3, Worst Fitness: inf
Generation 4, Best Fitness: 2.6144964724113855
Generation 4, Worst Fitness: inf
Generation 5, Best Fitness: 2.5660550396611304
Generation 5, Worst Fitness: inf
Generation 6, Best Fitness: 2.4137046139127394
Generation 6, Worst Fitness: inf
Generation 7, Best Fitness: 2.6033291831690435
Generation 7, Worst Fitness: inf
Generation 8, Best Fitness: 2.6466870877808506
Generation 8, Worst Fitness: inf
Generation 9, Best Fitness: 2.3622258643407976
Generation 9, Worst Fitness: inf
Generation 10, Best Fitness: 1.9960642967107718
Generation 10, Worst Fitness: inf
Generation 11, Best Fitness: 1.4156711028532982
Generation 11, Worst Fitness: inf
Generation 12, Best Fitness: 1.5569401616686347
Generation 12, Worst Fitness: inf
Generation 13, Best Fitness: 2.46065

In [361]:
a = generate_random_tree(2)
b = generate_random_tree(2)

In [362]:
print_tree(a)

add
  mul
    Adiponectin
    Insulin
  add
    BMI
    BMI


In [363]:
print_tree(b)

div
  sub
    MCP.1
    Insulin
  mul
    Resistin
    Resistin


In [364]:
x = crossover(a,b,2)

In [365]:
print_tree(x)

add
  mul
    Adiponectin
    Insulin
  add
    BMI
    BMI
