In [1]:
from sklearn.utils import resample
import operator
import math
import random
import sys
import numpy

from functools import partial

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from deap import gp

from sklearn.metrics import silhouette_score

from DataManagment import *
# from Evaluators import *
# from Metrics import *
# from ResultsManagment import *
# from Visualization import *
# from Testing.testEvaluation import get_metric

In [2]:
#disable logging
import logging
logging.getLogger().setLevel(logging.WARNING)

In [3]:
#HARD ASSUMPTIONS:
#all features are positive

In [4]:
#Experiment parameters
SEED = None 
FRACTION = 0.5

DATASET_NAME = "iris"
DATASET = DistDataLoader("../data").load_dataset(DATASET_NAME)
#generation method has to be finally a gp. class
GENERATION_METHOD = gp.genHalfAndHalf
GENERATION_PARAMETERS = {"min_" :1,"max_":2}
#individual parameters
MAX_DEPTH = 69 #HARD MAX FROM PYTHON:91
MAX_DEPTH = 69 #HARD MAX FROM PYTHON:91
#selection method
SELECTION_METHOD = tools.selTournament
SELECTION_PARAMETERS = {"tournsize":3}
#crossover
CROSSOVER_METHOD = gp.cxOnePoint
CROSSOVER_PARAMETERS = {}
#mutation individual
MUTATION_METHOD = gp.genFull
MUTATION_PARAMETERS = {"min_" :0,"max_":2}
#muatation policy
POLICY_MUTATION = gp.mutUniform
#!there is POLICY_MUTATION_PARAMS defined later

#GENERAL EVOLUTION
POPULATION_SIZE = 300
CROSSOVER_PROB = 0.5
MUTATION_PROB = 0.1
NUMBER_OF_GENERATIONS = 200



Loading Iris dataset from: ../data/iris.csv


In [5]:
#dataset loading
loader = DistDataLoader("../data")
dataset_name = "iris"
iris_dataset = loader.load_dataset(dataset_name)

if iris_dataset is None:
        print(f"❌ Failed to load {dataset_name}")

Loading Iris dataset from: ../data/iris.csv


In [6]:
#protective primitives deffinition

def protectedDiv(left, right):
    try:
        return left / right
    except ZeroDivisionError:
        return 1
def protectedLog(x):
    try:
        return numpy.log(x) if x > 0 else 1
    except Exception:
        return 1
    
def protectedCos(x):
    return numpy.clip(numpy.cos(x), -1, 1) + 2

def protectedSin(x):
    return numpy.clip(numpy.sin(x), -1, 1) + 2

In [7]:
n_features = len(DATASET.feature_names)
pset = gp.PrimitiveSet("MAIN", n_features * 2)

all_names = []
for name in DATASET.feature_names:
    all_names.append(name + "1")
for name in DATASET.feature_names:
    all_names.append(name + "2")

arg_map = {f"ARG{i}": name for i, name in enumerate(all_names)}

pset.renameArguments(**arg_map)

In [8]:
#adding primitives
pset.addPrimitive(protectedDiv, 2)
pset.addPrimitive(protectedLog,1)
pset.addPrimitive(protectedCos, 1)
pset.addPrimitive(protectedSin, 1)

pset.addPrimitive(operator.add, 2)
pset.addPrimitive(operator.sub, 2)
pset.addPrimitive(operator.mul, 2)

In [9]:
pset.addEphemeralConstant("rand13", partial(random.randint, 1, 3))

In [10]:
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMax)

In [11]:
toolbox = base.Toolbox()
#TODO check different generations
toolbox.register("expr", GENERATION_METHOD, pset=pset,**GENERATION_PARAMETERS)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("compile", gp.compile, pset=pset)

In [12]:
#evalutaion function
#remember about returning a tuple
def silhouetteFitness(individual, dataset: Dataset = DATASET, fract: float = FRACTION, random_state: int = random.randint(0, 9999)):
    try:
        func = toolbox.compile(expr=individual)
    except MemoryError:
        print('''
              --------------------------
              THERE WAS A MEMORY ERROR!
              --------------------------
              ''')
        print(sys.exc_info())
        return -1

    Xsample, ysample = resample(
        dataset.X, dataset.y, 
        n_samples=int(len(dataset.X) * fract), 
        random_state=random_state
    )

    feature_names_1 = []
    feature_names_2 = []

    for name in dataset.feature_names:
        feature_names_1.append(name + "1")
        feature_names_2.append(name + "2")

    # Define a metric function that takes two 1D arrays
    def metric(u, v):
        # Map feature names to values for func
        args_u = {name: val for name, val in zip(feature_names_1, u)}
        args_v = {name: val for name, val in zip(feature_names_2, v)}
        args_x = {**args_u, **args_v}
        return max(0,func(**args_x))

    try: 
        result = silhouette_score(Xsample, ysample, metric=metric)
    except ValueError:
        print('''
              *****************************
              THERE WAS AN ERROR CALLING SILHOUETTE!
              *****************************
              ''')
        return -1
    return result,

In [13]:
toolbox.register("evaluate", silhouetteFitness)
toolbox.register("select", tools.selTournament, **SELECTION_PARAMETERS)
toolbox.register("mate", CROSSOVER_METHOD,**CROSSOVER_PARAMETERS)
toolbox.register("expr_mut", MUTATION_METHOD, **MUTATION_PARAMETERS)
#TODO remember about this
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

toolbox.decorate("mate", gp.staticLimit(key=operator.attrgetter("height"), max_value=MAX_DEPTH))
toolbox.decorate("mutate", gp.staticLimit(key=operator.attrgetter("height"), max_value=MAX_DEPTH))

In [14]:
#MAIN
pop = toolbox.population(n=POPULATION_SIZE)
hof = tools.HallOfFame(1)

stats_fit = tools.Statistics(lambda ind: ind.fitness.values)
stats_size = tools.Statistics(len)
mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size)
mstats.register("avg", numpy.mean)
mstats.register("std", numpy.std)
mstats.register("min", numpy.min)
mstats.register("max", numpy.max)

# pop, log = algorithms.eaSimple(pop, toolbox, CROSSOVER_PROB, MUTATION_PROB, NUMBER_OF_GENERATIONS, stats=mstats,
#                                    halloffame=hof, verbose=True)
pop, log = algorithms.eaSimple(pop, toolbox, CROSSOVER_PROB, MUTATION_PROB, 20, stats=mstats,
                                   halloffame=hof, verbose=True)

   	      	                                 fitness                                 	                      size                     
   	      	-------------------------------------------------------------------------	-----------------------------------------------
gen	nevals	avg       	gen	max     	min     	nevals	std      	avg    	gen	max	min	nevals	std    
0  	300   	-0.0664703	0  	0.692018	-0.29955	300   	0.0892395	3.36333	0  	7  	2  	300   	1.38732
1  	137   	-0.0232903	1  	0.692018	-0.324005	137   	0.0855288	3.5    	1  	7  	1  	137   	1.46856
2  	178   	-0.0144593	2  	0.439923	-0.311727	178   	0.102068 	3.87   	2  	11 	1  	178   	1.77006


  return left / right
  sil_samples = inter_clust_dists - intra_clust_dists
  return left / right
  sil_samples = inter_clust_dists - intra_clust_dists


3  	150   	0.017543  	3  	0.52345 	-0.426247	150   	0.116446 	4.04667	3  	12 	1  	150   	1.87559


  return left / right
  sil_samples = inter_clust_dists - intra_clust_dists
  return left / right


4  	167   	0.0390723 	4  	0.79041 	-0.508769	167   	0.137269 	4.08   	4  	11 	1  	167   	1.98165


  return left / right
  sil_samples = inter_clust_dists - intra_clust_dists
  return left / right
  return left / right
  sil_samples = inter_clust_dists - intra_clust_dists


5  	139   	0.0678944 	5  	0.79041 	-0.431882	139   	0.145828 	4.14333	5  	12 	1  	139   	2.04519


  return left / right
  sil_samples = inter_clust_dists - intra_clust_dists
  return left / right
  return left / right
  return left / right
  return left / right


6  	159   	0.117578  	6  	0.79041 	-0.560374	159   	0.197293 	4.30667	6  	10 	1  	159   	1.8813 


  return left / right
  return left / right
  return left / right
  return left / right
  return left / right
  return left / right
  return left / right
  return left / right


7  	139   	0.184065  	7  	0.79041 	-0.528565	139   	0.252291 	4.78   	7  	12 	1  	139   	2.05871


  return left / right
  return left / right
  return left / right
  return left / right
  return left / right
  return left / right


8  	173   	0.211195  	8  	0.888934	-0.413146	173   	0.280182 	4.91333	8  	13 	1  	173   	2.14456


  return left / right
  return left / right
  return left / right
  return left / right
  return left / right
  sil_samples = inter_clust_dists - intra_clust_dists
  return left / right
  return left / right
  return left / right
  sil_samples = inter_clust_dists - intra_clust_dists
  return left / right


9  	165   	0.264699  	9  	0.888934	-0.587617	165   	0.329732 	5.04   	9  	13 	1  	165   	2.12251


  return left / right
  return left / right
  return left / right
  return left / right


10 	154   	0.368169  	10 	0.907866	-0.587617	154   	0.346404 	5.11333	10 	17 	1  	154   	2.03482


  return left / right
  return left / right
  return left / right
  return left / right


11 	177   	0.392579  	11 	0.907866	-0.96    	177   	0.374604 	5.62333	11 	17 	1  	177   	2.20789


  return left / right
  return left / right
  return left / right


12 	169   	0.459602  	12 	0.92538 	-0.973333	169   	0.368494 	6.18667	12 	17 	1  	169   	2.34773


  return left / right
  return left / right
  return left / right
  return left / right
  return left / right


13 	158   	0.52603   	13 	0.92538 	-0.973333	158   	0.380087 	6.87   	13 	19 	1  	158   	2.30357


  return left / right
  return left / right
  return left / right
  return left / right
  return left / right


14 	167   	0.571259  	14 	0.92538 	-0.67904 	167   	0.361359 	7.17   	14 	15 	1  	167   	2.20932


  return left / right
  return left / right
  return left / right
  return left / right
  return left / right


15 	168   	0.626611  	15 	0.926214	-0.973333	168   	0.363399 	7.85333	15 	15 	2  	168   	2.2402 


  return left / right
  return left / right
  return left / right
  return left / right
  return left / right


16 	153   	0.662451  	16 	0.92538 	-0.859422	153   	0.342788 	8.15   	16 	18 	2  	153   	2.38205


  return left / right
  return left / right
  return left / right
  return left / right
  return left / right


17 	161   	0.655333  	17 	0.92538 	-0.96    	161   	0.360656 	8.3    	17 	19 	1  	161   	2.34023


  return left / right
  return left / right
  return left / right
  return left / right
  return left / right
  return left / right


18 	168   	0.645969  	18 	0.92538 	-0.96    	168   	0.384235 	8.35667	18 	17 	1  	168   	2.25006


  return left / right
  return left / right


19 	140   	0.685821  	19 	0.92538 	-0.96    	140   	0.379616 	8.28   	19 	16 	3  	140   	2.29236


  return left / right
  return left / right
  return left / right
  return left / right
  return left / right


20 	147   	0.681264  	20 	0.92538 	-0.933333	147   	0.360702 	8.26667	20 	20 	1  	147   	2.4931 


In [15]:
print(pop[-10])
print(hof.items[0])

mul(sub(protectedDiv(petal_width2, petal_width1), petal_width1), petal_width2)
mul(sub(mul(protectedCos(1), protectedDiv(petal_length2, sepal_length1)), petal_width1), sub(petal_width2, petal_width1))


#### results

In [16]:
from Classifiers import *

In [17]:
# metric = EuclideanMetric()
# classifier = KNNWrapper(metric)
# evaluator = Evaluator()

# evaluator.crossValidateClassifier(iris_dataset, classifier)