# 1. Configuration

### Imports

In [None]:
import pygad
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import random
import math
import time
from sympy import symbols, simplify

### Read the data

In [None]:
# Load the equations table. Later cells read column 0 as the equation string
# and columns 1 and 2 as bracketed, comma-separated lists of numbers.
dataset = pd.read_csv("dataset.csv")
# Bare expression: evaluates to the loaded table (shown as the cell output
# when run in a notebook).
dataset

# 2. Representation

In [None]:
# Maximum number of tokens an equation may have; also used as the fixed width
# of the zero-padded integer encoding handed to the genetic algorithm.
maxEquationLength = 25

#### Helper functions

Convert a `string equation` into an `array of tokens`.

In [None]:
def stringEQtoArray(equation):
    """Split an equation string into an array of tokens.

    Recognised tokens are the variable ``x``, unsigned integers (a run of
    digits is kept together as ONE token), the operators ``+ - * / **`` and
    the parentheses ``( )``. Every other character (spaces, ``.``, letters
    other than ``x``) is skipped.

    A ``+`` or ``-`` directly followed by a digit is folded into the number
    (``'-3'``; a unary plus is dropped) only when it is a unary sign, i.e.
    when it appears at the start of the equation or right after an operator
    or ``(``. After ``x``, a number or ``)`` it is a binary operator and is
    emitted as its own token, so ``"x+1"`` becomes ``['x', '+', '1']``.

    Parameters
    ----------
    equation : str
        The equation text, e.g. ``"x**2 - 3*x"``.

    Returns
    -------
    numpy.ndarray
        1-D array of string tokens (an empty float array for an equation
        without any token, as before).
    """
    # Tokens after which a following sign can only be a unary sign.
    sign_context = ('+', '-', '*', '/', '**', '(')
    single_char_tokens = ('+', '-', '*', '/', '(', ')')

    tokens = []
    length = len(equation)
    pos = 0

    while pos < length:
        char = equation[pos]

        if char == 'x':
            tokens.append('x')
            pos += 1
        elif char.isdigit():
            # Consume the whole run of digits as a single number token.
            end = pos + 1
            while end < length and equation[end].isdigit():
                end += 1
            tokens.append(equation[pos:end])
            pos = end
        elif equation.startswith('**', pos):
            tokens.append('**')
            pos += 2
        elif (char in ('+', '-')
              and pos + 1 < length
              and equation[pos + 1].isdigit()
              and (not tokens or tokens[-1] in sign_context)):
            # Unary sign: attach it to the number that follows.
            end = pos + 2
            while end < length and equation[end].isdigit():
                end += 1
            number = equation[pos + 1:end]
            tokens.append(number if char == '+' else '-' + number)
            pos = end
        elif char in single_char_tokens:
            tokens.append(char)
            pos += 1
        else:
            # Whitespace and unsupported characters are ignored.
            pos += 1

    # A single np.append onto an empty array keeps the dtype the previous
    # implementation produced while avoiding one array copy per token.
    return np.append(np.array([]), tokens)

Convert an `array of tokens` to a `string equation`.

In [None]:
def arrayEQtoString(equation):
    """Join a sequence of string tokens into one equation string.

    Every token is followed by a single space, so the result ends with a
    trailing space (``['x', '+', '1']`` -> ``"x + 1 "``). An empty sequence
    gives an empty string.
    """
    return "".join(token + " " for token in equation)

Convert a string sequence (e.g. `"[1, 2, 3]"`) to a numeric `array`.

In [None]:
def stringArrayToArray(string):
    """Parse a bracketed, comma-separated list of numbers into a float array.

    The first and last character of ``string`` (the brackets) are discarded,
    the rest is split on ``,`` and every piece is converted with ``float``.
    """
    inner = string[1:-1]
    return np.array([float(piece) for piece in inner.split(',')])

Evaluates the `equation` at the integer points x = 1, 2, …, `x_values`.

In [None]:
def evaluteEquation(equation, x_values):
    """Evaluate an equation string at x = 1, 2, ..., x_values.

    Parameters
    ----------
    equation : str
        A Python expression in the variable ``x``; the ``math`` module is
        available inside the expression.
    x_values : int
        Number of integer points to evaluate, starting at x = 1.

    Returns
    -------
    numpy.ndarray
        The results in order of increasing x.

    Notes
    -----
    The expression is run with ``eval``, which executes arbitrary Python
    code, so only trusted equation strings must be passed in.
    """
    values = np.array([])
    for offset in range(x_values):
        # x runs from 1 to x_values inclusive.
        value = eval(equation, {'x': offset + 1}, {'math': math})
        values = np.append(values, value)
    return values

Testing

In [None]:
# Everything works correctly :)
# Sanity check: for dataset rows 0..97, evaluate the equation string from
# column 0 at x = 1..100 and parse the stored outputs from column 2. The two
# arrays can be compared with the commented-out print below; nothing is
# asserted here. Note that this overwrites the global `outputs`.

for x in range(0,98):
    # Column 0 holds the equation as a string.
    equation_inputs = dataset.iloc[x].values[0]
    
    # Column 2 holds the stored outputs as a bracketed list.
    outputs = stringArrayToArray(dataset.iloc[x].values[2])
    
    equationOutputs = evaluteEquation(equation_inputs, 100)

    # Uncomment to print the summed difference between stored and computed outputs.
    #print(np.sum(outputs - equationOutputs))

    

Tests if the equation `is valid`.

In [None]:
def is_valid(solution):
    """Return True when a token sequence is a usable equation.

    An equation is valid when it

    * contains the variable ``x``,
    * has at most ``maxEquationLength`` tokens, and
    * can be evaluated at x = 1..100 without raising.

    Parameters
    ----------
    solution : sequence of str
        The equation as an array/list of tokens.

    Returns
    -------
    bool
    """
    # An equation without x does not depend on the input -> not valid.
    if 'x' not in solution:
        return False

    # An equation longer than the fixed encoding width -> not valid.
    if len(solution) > maxEquationLength:
        return False

    # Try to evaluate the equation. Any ordinary error (syntax error, division
    # by zero, overflow, ...) marks it as invalid. `Exception` is caught rather
    # than using a bare `except` so that KeyboardInterrupt / SystemExit still
    # stop a long-running search.
    try:
        evaluteEquation(arrayEQtoString(solution), 100)
    except Exception:
        return False

    return True

`Simplifies` the equation.

In [None]:
def simplify_equation(equation):
    """Simplify a tokenised equation with sympy and tokenise the result.

    Parameters
    ----------
    equation : sequence of str
        The equation as an array/list of tokens.

    Returns
    -------
    numpy.ndarray
        Tokens of the simplified equation, produced by ``stringEQtoArray``
        from sympy's string form of the simplified expression.
    """
    # sympy parses the string itself and turns the name `x` into a symbol, so
    # no explicit `symbols('x')` declaration is needed here.
    simplified = simplify(arrayEQtoString(equation))

    return stringEQtoArray(str(simplified))

Transform an array of `chars` to an `int` array, and its inverse (`int` -> `char`).

In [None]:
def char_to_int_array(array):
    """Encode equation tokens as a list of integers.

    ``x``, the single-character operators and the parentheses are encoded as
    their character code (``ord``), the power operator ``**`` as 420, and
    every other token is treated as a number and converted with
    ``int(float(token))``.
    """
    single_char_tokens = ('x', '+', '-', '*', '/', '(', ')')

    codes = []
    for token in array:
        if token == '**':
            code = 420
        elif token in single_char_tokens:
            code = ord(token)
        else:
            # Anything else must be a numeric token such as '7' or '-3'.
            code = int(float(token))
        codes.append(code)
    return codes

In [None]:
def int_array_to_char(array):
    """Decode a sequence of integer codes back into equation tokens.

    Codes equal to the character code of ``x``, ``+``, ``-``, ``*``, ``/``,
    ``(`` or ``)`` become that character, 420 becomes ``**`` and every other
    value is rendered with ``str``.
    """
    token_by_code = {ord(char): char for char in 'x+-*/()'}
    token_by_code[420] = '**'

    return [
        token_by_code[code] if code in token_by_code else str(code)
        for code in array
    ]

Adds `padding` to the int array, and its inverse (removes padding).

In [None]:
def add_padding(array):
    """Copy ``array`` into a zero-filled array of length ``maxEquationLength``.

    The values keep their positions; the remaining slots stay 0. A sequence
    longer than ``maxEquationLength`` raises ``IndexError``.
    """
    padded = np.zeros(maxEquationLength)
    for position, value in enumerate(array):
        padded[position] = value
    return padded

In [None]:
def remove_padding(array):
    """Return the values before the first 0 as a list of ``int``.

    Everything from the first zero onwards is treated as padding and dropped,
    so a genuine 0 value cannot be told apart from padding.
    """
    kept = []
    for value in array:
        if value == 0:
            # First padding slot reached: the equation ends here.
            return kept
        kept.append(int(value))
    return kept

Puts the last two functions together.

In [None]:
def equation_for_GA(equation):
    """Encode a tokenised equation into the representation used by the GA.

    The tokens are first converted to integer codes with
    ``char_to_int_array`` and then passed to ``transform_array`` together
    with the target width.

    NOTE(review): ``transform_array`` is not defined in this part of the
    file; presumably it pads the codes to the given width - confirm.

    Parameters
    ----------
    equation : sequence of str
        The equation as an array/list of tokens.
    """
    codes = char_to_int_array(equation)
    # Use the shared width constant instead of a literal 25 so the encoding
    # stays in step with the rest of the code if the limit is changed.
    return transform_array(codes, maxEquationLength)

In [None]:
def equation_from_GA(equation):
    """Decode a GA individual back into a list of equation tokens.

    The individual is first passed through ``inverse_transform_array`` and
    the resulting integer codes are turned into tokens with
    ``int_array_to_char``.

    NOTE(review): ``inverse_transform_array`` is not defined in this part of
    the file; presumably it strips the padding - confirm.
    """
    return int_array_to_char(inverse_transform_array(equation))

# 3. Genetic algorithm

In [None]:
#trying for the first equation first
eq_num = 2

true_equation = stringEQtoArray(dataset.iloc[eq_num].values[0])
inputs = stringArrayToArray(dataset.iloc[eq_num].values[1])
outputs = np.array(stringArrayToArray(dataset.iloc[eq_num].values[2]))

print(true_equation)
print(inputs)
print(outputs)

`Model` returns an array of function outputs and the equation length.

In [None]:
def model(equation):
    """Decode a GA individual and evaluate it at x = 1..100.

    Parameters
    ----------
    equation : sequence
        A GA individual as accepted by ``equation_from_GA``.

    Returns
    -------
    tuple
        ``(outputs, length)``: the array returned by ``evaluteEquation`` for
        100 points and the number of tokens in the decoded equation.
    """
    tokens = equation_from_GA(equation)

    # Open idea: exclude '(' and ')' from the length so that parentheses are
    # not penalised. Currently every token counts.
    predictions = evaluteEquation(arrayEQtoString(tokens), 100)

    return predictions, len(tokens)


`Fitness function` returns the fitness of a given equation.

In [None]:
def fitness_func(ga_instance, solution, solution_idx):
    """Fitness of one GA individual (higher is better).

    The error is the sum over all points of
    ``log10(|model output - target output| + 1)`` plus a length penalty of
    0.1 per token; the fitness is the negated error. The target values are
    read from the module-level ``outputs`` array.

    Parameters
    ----------
    ga_instance : object
        The running GA instance (not used).
    solution : sequence
        The individual to score, as accepted by ``model``.
    solution_idx : int
        Index of the individual in the population (not used).

    Returns
    -------
    float
        ``-error``; ``-inf`` when the individual cannot be evaluated or its
        error is not a finite number.
    """
    try:
        # Decoding/evaluating the individual is part of what can fail (e.g. a
        # division by zero), so it belongs inside the try block as well:
        # a broken individual gets the worst fitness instead of aborting the
        # whole run.
        model_outputs, equation_length = model(solution)

        # Both terms are non-negative, so the error never needs a sign flip.
        # Tune the 0.1 length penalty if needed.
        error = np.sum(np.log10(np.abs(model_outputs - outputs) + 1)) + equation_length*0.1

        # NaN (e.g. from inf - inf) would poison fitness comparisons.
        if not np.isfinite(error):
            error = np.inf

    except Exception:
        error = np.inf

    return -error

`Crossover function` receives `N parents` and returns `N/2 children` (one child per consecutive pair of parents).

In [None]:
def crossover_func(parents, offspring_size, ga_instance):
    """Subtree crossover: build one child from every consecutive parent pair.

    For parents ``(0, 1), (2, 3), ...`` a random subtree of the first parent
    replaces a random subtree of the second parent. The result is simplified;
    if it is not a valid equation the (possibly substituted) first parent is
    used as the child instead.

    Parameters
    ----------
    parents : sequence of encoded equations
        The selected parents; an even number is expected because they are
        consumed in pairs.
    offspring_size : tuple
        Requested offspring shape (not used; the number of children is half
        the number of parents).
    ga_instance : object
        The running GA instance (not used).

    Returns
    -------
    numpy.ndarray
        Float array with one encoded child per row and
        ``maxEquationLength`` columns.
    """
    # Start from an empty float block of the encoding width (previously a
    # hard-coded 25) so the result has the right shape and dtype even when
    # there are no parents.
    rows = [np.empty((0, maxEquationLength))]

    for i in range(0, len(parents), 2):
        parent1 = equation_from_GA(parents[i])
        parent2 = equation_from_GA(parents[i+1])

        # If one parent is rejected by simpl_is_valid, use the other parent
        # for both roles.
        if not simpl_is_valid(parent1):
            parent1 = parent2

        if not simpl_is_valid(parent2):
            parent2 = parent1

        # Build an expression tree for each parent.
        parent1_tree = EquationTree()
        parent1_tree.build_tree(parent1)

        parent2_tree = EquationTree()
        parent2_tree.build_tree(parent2)

        # Graft a random subtree of parent 1 into parent 2.
        parent1_subtree = parent1_tree.get_random_subtree()
        parent2_tree.replance_random_subtree(parent1_subtree)
        child = simplify_equation(parent2_tree.get_array())

        # Fall back to parent 1 when the crossover produced an invalid equation.
        if not is_valid(child):
            child = parent1

        rows.append(np.array(equation_for_GA(child), dtype=int).reshape(1, -1))

    return np.vstack(rows)

Try crossover.

`Mutation function ` receives N equations and returns N `mutated` equations.

In [None]:
def _is_number_token(token):
    """Return True for integer tokens such as '7' or '-3'."""
    text = str(token)
    if text.startswith('-'):
        text = text[1:]
    return text.isdigit()


def _mutate_random_token(tokens):
    """Mutation type 1: change one randomly chosen operator, number or x.

    * an operator is replaced by a random operator,
    * a number (positive or negative) by a random non-zero integer in -10..9,
    * ``x`` by ``( x + k )`` with a random k in 1..9.

    Returns a new list; the input is left untouched. If no token can be
    mutated the tokens are returned unchanged.
    """
    operators = ['+', '-', '*', '/', '**']
    tokens = list(tokens)

    # Pick uniformly among the tokens that can actually be mutated instead of
    # re-drawing until one is hit (which never ends if there is none).
    candidates = [
        idx for idx, token in enumerate(tokens)
        if token in operators or token == 'x' or _is_number_token(token)
    ]
    if not candidates:
        return tokens

    idx = random.choice(candidates)
    token = tokens[idx]

    if token in operators:
        tokens[idx] = str(np.random.choice(operators))
    elif token == 'x':
        number = str(np.random.randint(1, 10))
        tokens[idx:idx + 1] = ['(', 'x', '+', number, ')']
    else:
        number = str(np.random.randint(-10, 10))
        # 0 is reserved as padding in the integer encoding.
        tokens[idx] = '1' if number == '0' else number

    return tokens


def _extend_with_operation(tokens):
    """Mutation type 2: attach ``<number> <operator>`` in front of, or
    ``<operator> <number>`` behind, the equation.

    The equation is wrapped in parentheses first when it neither starts with
    ``(`` nor ends with ``)``. Returns a new list.
    """
    tokens = list(tokens)
    if not tokens:
        return tokens

    if tokens[0] != '(' and tokens[-1] != ')':
        tokens = ['('] + tokens + [')']

    at_start = np.random.rand() > 0.5
    operator = str(np.random.choice(['+', '-', '*', '/']))
    number = str(np.random.randint(1, 10))

    if at_start:
        return [number, operator] + tokens
    return tokens + [operator, number]


def mutation_func(offspring, ga_instance):
    """Mutate every offspring equation and return the encoded results.

    Each equation is decoded and then, with probability 0.8, one of its
    tokens is changed (type 1); otherwise a random ``operator number`` pair
    is attached to it (type 2). The mutated equation is simplified. Whenever
    the mutated or the simplified equation is not valid, the original
    offspring is used instead.

    Parameters
    ----------
    offspring : sequence of encoded equations
        The individuals to mutate.
    ga_instance : object
        The running GA instance (not used).

    Returns
    -------
    numpy.ndarray
        One encoded equation per row, ``maxEquationLength`` columns; integer
        dtype unless ``offspring`` is empty.
    """
    rows = []

    for genes in offspring:
        original = equation_from_GA(genes)

        if np.random.rand() > 0.2:
            print("Mutation type 1")
            eq_mutated = _mutate_random_token(original)
        else:
            print("Mutation type 2")
            eq_mutated = _extend_with_operation(original)

        # Discard the mutation if it broke the equation.
        if not is_valid(eq_mutated):
            eq_mutated = list(original)

        eq_mutated = simplify_equation(eq_mutated)

        # Simplification can also yield something unusable (e.g. x cancels out).
        if not is_valid(eq_mutated):
            eq_mutated = list(original)

        rows.append(np.array(equation_for_GA(eq_mutated), dtype=int).reshape(1, -1))

    if not rows:
        return np.empty((0, maxEquationLength))
    return np.vstack(rows).astype(int)

Try mutation.

In [None]:
# Smoke test: mutate the single equation x + 1 and decode the result.
# mutation_func does not use its ga_instance argument, so a placeholder (1)
# is passed.
equation_from_GA(mutation_func(np.array([equation_for_GA(['x','+','1'])]), 1)[0])

Initialize the population.

In [None]:
# Seed equations: x combined with a single constant through each operator.
# Addition and multiplication use the constants 1..9, power and division 2..9.
pop1 = [['x', '+', str(n)] for n in range(1, 10)]
pop2 = [['x', '*', str(n)] for n in range(1, 10)]
pop3 = [['x', '**', str(n)] for n in range(2, 10)]
pop4 = [['x', '/', str(n)] for n in range(2, 10)]

# Encode every seed equation for the GA.
initial_population = [
    equation_for_GA(tokens) for tokens in pop1 + pop2 + pop3 + pop4
]

# Allowed gene values: x, the operators, ** (420) and the constants
# 1..9 and -1..-9.
gene_space = np.array(
    [ord(char) for char in 'x+-*/']
    + [420]
    + list(range(1, 10))
    + list(range(-1, -10, -1))
)

In [None]:
# GA sizing. Elitism keeps every individual except the slots taken by the
# new children, of which there is one per pair of mating parents.
num_parents_mating = 20
sol_per_pop = len(initial_population)
keep_elitism = int(sol_per_pop - num_parents_mating/2)

print("sol_per_pop: ", sol_per_pop)
print("num_parents_mating: ", num_parents_mating)
print("keep_elitism: ", keep_elitism)

In [None]:
# Debug callbacks: each one ignores its two arguments and only prints the name
# of a GA stage. They can be passed as the on_* hooks of pygad.GA.

def printCrossover(x,y):
    """Print a marker for the crossover stage; both arguments are ignored."""
    print("Crossover")

def printMutation(x,y):
    """Print a marker for the mutation stage; both arguments are ignored."""
    print("Mutation")

def printFitness(x,y):
    """Print a marker for the fitness stage; both arguments are ignored."""
    print("Fitness")

def printParents(y,parents):
    """Print a marker for the parent-selection stage; both arguments are ignored."""
    print("Parents")

In [None]:
# Run the genetic algorithm for the first 20 dataset rows, one run per
# equation, and plot the target outputs against the best equation found.
for i in range(20):   
    try:
        print("Equation: ", i)

        # Rebind the globals for this row; fitness_func reads `outputs`.
        true_equation = stringEQtoArray(dataset.iloc[i].values[0])
        inputs = stringArrayToArray(dataset.iloc[i].values[1])
        outputs = np.array(stringArrayToArray(dataset.iloc[i].values[2]))


        # Custom crossover/mutation functions operate on the integer-encoded
        # equations; every run starts from the same seed population.
        # NOTE(review): "saturate_100" presumably stops the run once the best
        # fitness has not changed for 100 generations - confirm in pygad docs.
        ga_instance = pygad.GA(num_generations=200,
                            num_parents_mating=num_parents_mating,
                            fitness_func=fitness_func,
                            initial_population=initial_population,
                            gene_type=int,
                            parent_selection_type="tournament",
                            keep_elitism=keep_elitism,
                            crossover_probability=0.8,
                            crossover_type=crossover_func,
                            mutation_type=mutation_func,
                            mutation_probability=0.5,
                            gene_space=gene_space,
                            stop_criteria="saturate_100")
                            # Optional debug hooks (disabled):
                            # on_crossover=printCrossover,
                            # on_mutation=printMutation,
                            # on_fitness=printFitness,
                            # on_parents=printParents)

        ga_instance.run()

        # Draws the fitness curve; the print shows whatever plot_fitness returns.
        print(ga_instance.plot_fitness())

        solution, solution_fitness, solution_idx = ga_instance.best_solution()

        # Same evaluation points as evaluteEquation(…, 100): x = 1..100.
        x = range(1, 101)

        # y1: target outputs from the dataset.
        y1 = outputs

        # y2: outputs of the best equation, evaluated through its tree.
        tree = EquationTree()
        tree.build_tree(equation_from_GA(solution))
        y2 = [tree.evaluate(i) for i in x]

        print("True equation: ", true_equation)
        print("Predicted equation: ", equation_from_GA(solution))

        # Plot the target outputs
        plt.plot(x, y1, label='True equation')

        # Plot the outputs of the predicted equation
        plt.plot(x, y2, label='Predicted equation')

        # Axis labels and title
        plt.xlabel('X-axis label')
        plt.ylabel('Y-axis label')
        plt.title('Two Arrays Plot')

        # Legend
        plt.legend()

        # Display the plot
        plt.show()

    except Exception as e:
        # Best effort: report the failure and move on to the next equation.
        print("ERROR: ", e)
        continue


In [None]:
# Report the best individual of the most recent GA run (ga_instance still
# refers to the last instance created above).
solution, solution_fitness, solution_idx = ga_instance.best_solution()
print("Best parameter", equation_from_GA(solution))
print("Fitness", solution_fitness)
print("Index", solution_idx)

In [None]:
# Draw the fitness curve of the most recent GA run; the print shows whatever
# plot_fitness returns.
print(ga_instance.plot_fitness())