# 1. Configuration

### Imports

In [None]:
import copy
import math
import random
import time
from multiprocessing import Process, Value

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pygad

### Read the data

In [None]:
# Load the equation dataset from the working directory. Later cells read each
# row positionally: values[0] is the equation string, values[1] and values[2]
# are stringified lists of inputs and outputs.
dataset = pd.read_csv("dataset.csv")
# Bare expression: displays the frame when run as a notebook cell.
dataset

# 2. Representation

### Tree data structure

#### Node class

In [None]:
class Node:
    """A single vertex of an expression tree.

    ``value`` is the token stored at this vertex; ``children`` is the ordered
    list of child nodes and is empty for a leaf.
    """

    def __init__(self, value=None):
        self.value, self.children = value, []

    def add_child(self, child):
        """Attach ``child`` as the last child of this node."""
        self.children += [child]

#### Equation tree class

The `EquationTree` class is a custom implementation of a binary tree data structure that represents a mathematical equation. It is used to parse and evaluate mathematical expressions in a tree-like structure. 

The class has the following methods:

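- `build_tree(self, expression)`: Builds the tree structure from a given mathematical expression (a sequence of tokens).
- `print_tree(self)`: Prints the tree structure in a readable format.
- `evaluate(self, x)`: Evaluates the mathematical expression represented by the tree structure for a given value of `x`.
- `get_array(self)`: Returns the tree as a parenthesised list of tokens.
- `get_random_subtree(self)`: Returns a randomly chosen subtree as a new `EquationTree`.
- `replance_random_subtree(self, new_subtree)`: Replaces a randomly chosen subtree with `new_subtree`.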


In [None]:
class EquationTree:
    """Binary expression tree for equations in the single variable ``x``.

    An expression is a sequence of string tokens: ``'x'``, integer literals
    (optionally prefixed with ``-``), the binary operators ``+ - * / **`` and
    parentheses.  Inner nodes store an operator and exactly two children;
    leaves store ``'x'`` or an integer literal.

    Public methods:
      * ``build_tree(expression)``   - parse a token sequence into the tree
      * ``print_tree()``             - print the tree, one node per line
      * ``evaluate(x)``              - evaluate the expression at ``x``
      * ``get_array()``              - fully parenthesised token list
      * ``get_random_subtree()``     - random subtree as an ``EquationTree``
      * ``replance_random_subtree(new_subtree)`` - graft a copy of
        ``new_subtree`` over a randomly chosen node
    """

    # Binding strength of the binary operators; a lower number binds looser
    # and is therefore split first when building the tree.
    _PRECEDENCE = {'+': 1, '-': 1, '*': 2, '/': 2, '**': 3}

    # Evaluation table for the operator tokens.
    _BINARY_OPS = {
        '+': lambda left, right: left + right,
        '-': lambda left, right: left - right,
        '*': lambda left, right: left * right,
        '/': lambda left, right: left / right,
        '**': lambda left, right: left ** right,
    }

    def __init__(self, root=None):
        self.root = root

    def build_tree(self, expression):
        """Parse ``expression`` (a sequence of tokens) and store it as the tree.

        Raises:
            ValueError: if the expression (or an operand of one of its
                operators) contains neither an operator nor a leaf token.
        """
        self.root = self._build_tree_recursive(expression)

    def print_tree(self):
        """Print the tree, indenting every node by its depth."""
        self._print_tree_recursive(self.root)

    def evaluate(self, x):
        """Return the value of the expression at ``x`` (``None`` if empty)."""
        return self._evaluate_recursive(self.root, x)

    def get_array(self):
        """Return the expression as a fully parenthesised token list."""
        return self._get_array_recursive(self.root)

    def replance_random_subtree(self, new_subtree):
        """Overwrite a randomly chosen node with a copy of ``new_subtree``.

        If this tree is empty or a single leaf, the whole tree is replaced;
        otherwise the root is kept and a random descendant is overwritten.
        The grafted nodes are deep-copied so the two trees never share nodes.
        Sharing would create a cycle when the donor subtree comes from this
        very tree and contains the node being overwritten.
        """
        graft_root = copy.deepcopy(new_subtree.root)
        if self.root is None or self.root.children == []:
            self.root = graft_root
        else:
            self._replace_random_subtree_recursive(
                random.choice(self.root.children), graft_root)

    # Correctly spelled alias; the original name is kept for existing callers.
    replace_random_subtree = replance_random_subtree

    def get_random_subtree(self):
        """Return a random subtree as an ``EquationTree`` (``None`` if empty).

        The returned tree shares its nodes with this tree.
        """
        if self.root is None:
            return None
        return self._get_random_subtree_recursive(self.root)

    # ------------------------------------------------------------------
    # recursive helpers
    # ------------------------------------------------------------------

    def _get_array_recursive(self, root):
        """In-order token list of ``root``; inner nodes are parenthesised."""
        if root is None:
            return []
        if not root.children:
            return [root.value]

        array = ["("]
        array += self._get_array_recursive(root.children[0])
        array.append(root.value)
        if len(root.children) > 1:
            array += self._get_array_recursive(root.children[1])
        array.append(")")
        return array

    def _get_random_subtree_recursive(self, current_node):
        """Random walk downwards: stop at a leaf, or at each inner node with p=1/2."""
        if not current_node.children:
            return EquationTree(current_node)

        if random.choice([True, False]):
            return self._get_random_subtree_recursive(
                random.choice(current_node.children))
        return EquationTree(current_node)

    def _replace_random_subtree_recursive(self, current_node, new_subtree_root):
        """Random walk downwards and overwrite the chosen node in place.

        The node object itself is kept (so its parent still points at it);
        only its value and children are taken from ``new_subtree_root``.
        """
        if current_node.children and random.choice([True, False]):
            self._replace_random_subtree_recursive(
                random.choice(current_node.children), new_subtree_root)
            return

        current_node.value = new_subtree_root.value
        current_node.children = new_subtree_root.children

    def _evaluate_recursive(self, root, x):
        """Evaluate the subtree at ``root``; unknown tokens evaluate to ``None``."""
        if root is None:
            return None

        token = root.value
        if token == 'x':
            return x
        if token.lstrip('-').isdigit():
            return int(token)

        operation = self._BINARY_OPS.get(token)
        if operation is None:
            return None
        left = self._evaluate_recursive(root.children[0], x)
        right = self._evaluate_recursive(root.children[1], x)
        return operation(left, right)

    def _print_tree_recursive(self, root, depth=0):
        """Print ``root`` and its descendants, one tab per depth level."""
        if root is None:
            return

        print('\t' * depth + str(root.value) + "--------")

        for child in root.children:
            self._print_tree_recursive(child, depth + 1)

    @staticmethod
    def _find_split(expression):
        """Locate the operator at which ``expression`` must be split.

        Returns ``(index, leaf_value)``: ``index`` is the position of the
        operator that becomes the root of the (sub)tree, or ``-1`` if there
        is no operator; ``leaf_value`` is the last leaf token seen, or
        ``None`` if there is none.

        The root operator is the one at the shallowest parenthesis depth;
        among those, the one with the lowest precedence; among equals, the
        rightmost for the left-associative ``+ - * /`` and the leftmost for
        the right-associative ``**``.  Depth is only compared relatively, so
        slices with unbalanced parentheses (produced by splitting) are fine.
        """
        depth = 0
        best_rank = None
        split_index = -1
        leaf_value = None

        for index, token in enumerate(expression):
            if token == '(':
                depth += 1
            elif token == ')':
                depth -= 1
            elif token in EquationTree._PRECEDENCE:
                rank = (depth, EquationTree._PRECEDENCE[token])
                if (best_rank is None or rank < best_rank
                        or (rank == best_rank and token != '**')):
                    best_rank = rank
                    split_index = index
            elif token == 'x' or token.lstrip('-').isdigit():
                leaf_value = token

        return split_index, leaf_value

    def _build_tree_recursive(self, expression):
        """Build the subtree for ``expression`` and return its root node.

        An expression without any operator becomes a leaf holding its last
        leaf token (surrounding parentheses are ignored).
        """
        split_index, leaf_value = self._find_split(expression)

        if split_index == -1:
            if leaf_value is None:
                raise ValueError(
                    "cannot parse expression: %r" % (list(expression),))
            return Node(leaf_value)

        root = Node(expression[split_index])
        # recursively build the left and the right operand
        root.add_child(self._build_tree_recursive(expression[:split_index]))
        root.add_child(self._build_tree_recursive(expression[split_index + 1:]))
        return root
        

#### Helper functions

Helper function to convert a string equation into an array of `tokens`.

In [None]:
def stringEQtoArray(equation):
    """Tokenise an equation string into a NumPy array of string tokens.

    Recognised tokens are ``x``, unsigned runs of digits, the operators
    ``+ - * / **`` and parentheses.  A ``+`` or ``-`` directly followed by a
    digit is folded into the number only where a sign can be unary: at the
    start, after another operator, or after ``(``.  ``x-1`` therefore yields
    ``x``, ``-``, ``1`` while ``x + -4`` yields ``x``, ``+``, ``-4``.
    Whitespace and unsupported characters are skipped.

    Args:
        equation: the equation as a string, e.g. ``"(x + -4) / 2"``.

    Returns:
        A 1-D ``numpy`` array of tokens (an empty array for an empty string).
    """
    # Tokens after which a following sign belongs to a number literal.
    unary_context = ('+', '-', '*', '/', '**', '(')
    length = len(equation)

    def digits_end(start):
        """Index just past the run of digits beginning at ``start``."""
        end = start
        while end < length and equation[end].isdigit():
            end += 1
        return end

    tokens = []
    position = 0
    while position < length:
        char = equation[position]

        if char.isdigit():
            end = digits_end(position)
            tokens.append(equation[position:end])
            position = end
        elif char == 'x' or char in ('/', '(', ')'):
            tokens.append(char)
            position += 1
        elif char == '*':
            # slicing instead of indexing: no overrun on a trailing '*'
            if equation[position + 1:position + 2] == '*':
                tokens.append('**')
                position += 2
            else:
                tokens.append('*')
                position += 1
        elif char in ('+', '-'):
            is_unary = not tokens or tokens[-1] in unary_context
            end = digits_end(position + 1)
            if is_unary and end > position + 1:
                number = equation[position + 1:end]
                # a unary plus is dropped, a unary minus is kept on the literal
                tokens.append(number if char == '+' else '-' + number)
                position = end
            else:
                tokens.append(char)
                position += 1
        else:
            position += 1

    return np.array(tokens)

Helper function to convert the string representation of a numeric sequence (e.g. `"[1, 2, 3]"`) into a list of floats.

In [None]:
def stringArrayToArray(string):
    """Parse a bracketed, comma-separated string such as ``"[1, 2.5]"`` into floats.

    The first and last character (the brackets) are dropped, the rest is
    split on commas and every piece is converted with ``float``.
    """
    inner = string[1:-1]
    return list(map(float, inner.split(',')))

In [None]:
# Sanity check of the tokenizer and tree helpers against dataset rows
# (everything works correctly :)).
# The range is empty on purpose, so the loop body is currently disabled;
# widen the range to re-run the check.
for x in range(0,0):
    equation_inputs = dataset.iloc[x].values[0]
    #print(equation_inputs)
    equation = stringEQtoArray(equation_inputs)
    print(equation)
    equation_tree = EquationTree()
    equation_tree.build_tree(equation)
    equation_tree.print_tree()

    subtree = equation_tree.get_random_subtree()
    print(subtree.get_array())
    subtree.print_tree()

    equation_tree.replance_random_subtree(subtree)
    print(equation_tree.get_array())

    outputs = stringArrayToArray(dataset.iloc[x].values[2])
    # Named total_error rather than sum so the builtin sum() is not shadowed
    # at module level once the loop is enabled.
    total_error = 0
    for i in range(100):
        total_error += (equation_tree.evaluate(i+1) - outputs[i])

    #print(total_error) # should be 0
    

In [None]:
def is_valid(solution, max_length=25, time_limit=0.1):
    """Return True if ``solution`` is an equation the GA is allowed to keep.

    A token sequence is rejected when it

    * is empty or longer than ``max_length`` tokens,
    * does not contain the variable ``x``,
    * contains the token ``'1e-'``,
    * contains the pattern ``x ** x``,
    * cannot be parsed into an ``EquationTree``, or
    * raises, or needs more than ``time_limit`` seconds, while being
      evaluated and converted to a string at any of ``x = 1 .. 100``.

    Args:
        solution: sequence of string tokens (list or NumPy array).
        max_length: maximum number of tokens (default 25, the genome size).
        time_limit: per-point evaluation budget in seconds (default 0.1).
    """
    length = len(solution)
    if length == 0 or length > max_length:
        return False

    # an equation without x is a constant -> not valid
    if 'x' not in solution:
        return False

    if '1e-' in solution:
        return False

    # x ** x is not allowed
    for i in range(length - 2):
        if solution[i] == 'x' and solution[i+1] == '**' and solution[i+2] == 'x':
            return False

    try:
        equation_tree = EquationTree()
        equation_tree.build_tree(solution)
        for i in range(100):
            start_time = time.perf_counter()
            # str() is part of the timed work: rendering a huge integer is
            # what makes some candidates unusably slow.
            str(equation_tree.evaluate(i+1))
            if time.perf_counter() - start_time > time_limit:
                return False
    except Exception:
        # Any parse or arithmetic failure (ZeroDivisionError, OverflowError,
        # RecursionError, ...) marks the candidate as invalid; unlike a bare
        # except this still lets KeyboardInterrupt stop a long GA run.
        return False

    return True

In [None]:
# Smoke call of is_valid on a nested power expression; as the last expression
# of a notebook cell its result is displayed.
is_valid(['(', 'x', '+', '-4', ')', '/', '(', '-5', '**', '(', 'x', '**', '4', ')', ')'])

In [None]:
# Timing harness for the same nested power expression.
tree = EquationTree()
tree.build_tree(['(', 'x', '+', '-4', ')', '/', '(', '-5', '**', '(', 'x', '**', '4', ')', ')'])

for i in range(100):
    print(i)
    startT = time.time()
    # The evaluate call is commented out, so the printed duration currently
    # covers only the two time.time() calls themselves.
    #print(tree.evaluate(i+1))
    print(time.time() - startT)

In [None]:
def simpl_is_valid(solution):
    """Return True if ``solution`` is non-empty and parses into an EquationTree.

    This is the cheap structural check; unlike ``is_valid`` it does not
    evaluate the equation or apply any length or content rules.
    """
    try:
        # Reject the empty sequence up front instead of relying on the
        # parser to fail on it.
        if len(solution) == 0:
            return False
        EquationTree().build_tree(solution)
    except Exception:
        # Any failure means "not parsable"; KeyboardInterrupt still propagates.
        return False

    return True

In [None]:
def simplify_equation(equation):
    """Apply local rewrite rules to a token sequence until nothing changes.

    Each pass scans the tokens left to right and blanks removed tokens by
    assigning '' to them; after the pass the '' placeholders are filtered
    out.  Passes repeat until one leaves no '' behind.  The rules visible
    below are: strip an outer pair of parentheses, ``1 ** anything -> 1``,
    drop ``** 1`` / ``* 1`` / ``/ 1``, ``x / x -> 1``, unwrap a single
    parenthesised token, and fold ``<int> <op> <int>``.

    The first pass writes into the sequence that was passed in, so the
    caller's object is modified.

    Returns:
        The simplified token list, or ``['x']`` if any exception is raised
        while simplifying.
    """
    
    try:
        simplified = False
        while not simplified:
            remove = False
            #remove () if they are on the start and end of the equation and if there is no operator in the first lvl of ()
            if equation[0] == '(' and equation[-1] == ')':
                remove = True
            lvl = 0 

            for i in range(len(equation)):

                if equation[i] == '(':
                    lvl += 1
                if equation[i] == ')':
                    lvl -= 1
                # an operator outside all parentheses: the outer pair does not wrap the whole equation
                if lvl == 0 and equation[i] in ['+','-','*','/','**']:
                    remove = False

                if i+2 < len(equation):
                    # 1**anything = 1
                    if equation[i] == '1' and equation[i+1] == '**':
                        if equation[i+2] == '(':
                            # blank the '**' and the whole parenthesised exponent
                            equation[i+1] = ''
                            equation[i+2] = ''
                            # advancing i only affects the rest of this iteration
                            # (the checks below then use the advanced index);
                            # the for loop rebinds i on the next iteration
                            i += 1
                            lvl = 1
                            # NOTE(review): the loop stops at the first ')' it meets, so a
                            # nested exponent such as (a * (b)) looks only partly blanked,
                            # and lvl is left at 1 afterwards - confirm
                            while equation[i+2] != ')' and lvl != 0:
                                if equation[i+2] == '(':
                                    lvl += 1
                                if equation[i+2] == ')':
                                    lvl -= 1
                                equation[i+2] = ''
                                i += 1
                            equation[i+2] = ''
                        else:
                            equation[i+1] = ''
                            equation[i+2] = ''

                if i+1 < len(equation):
                    #x**1 = x, x*1 = x, x/1 = x
                    # NOTE(review): only the operator and the following '1' are inspected,
                    # not what comes after the '1' - confirm this is safe for e.g. x * 1 ** 2
                    if equation[i] in ['**','*','/'] and equation[i+1] == '1':
                        equation[i] = ''
                        equation[i+1] = ''

                if equation[i] == 'x' and i+2 < len(equation):
                    
                    # x/x = 1
                    if equation[i+1] == '/' and equation[i+2] == 'x':
                        equation[i] = '1'
                        equation[i+1] = ''
                        equation[i+2] = ''

                # (x) = x, (1) = 1,...
                elif equation[i] == '(' and i+2 < len(equation):
                    if equation[i+2] == ')':
                        equation[i] = ''
                        equation[i+2] = ''
                        lvl -= 1

                # constant folding of <int> <op> <int>; [1:] accepts a one-character sign prefix
                # NOTE(review): neighbouring operators are not checked, so
                # ['x', '*', '2', '+', '3'] appears to fold to x * 5 - confirm
                elif (equation[i].isdigit() or equation[i][1:].isdigit()) and i+2 < len(equation):
                    if equation[i+2].isdigit() or equation[i+2][1:].isdigit():
                        if equation[i+1] == '+':
                            equation[i] = str(int(equation[i]) + int(equation[i+2]))
                            equation[i+1] = ''
                            equation[i+2] = ''
                        elif equation[i+1] == '-':
                            equation[i] = str(int(equation[i]) - int(equation[i+2]))
                            equation[i+1] = ''
                            equation[i+2] = ''
                        elif equation[i+1] == '*':
                            equation[i] = str(int(equation[i]) * int(equation[i+2]))
                            equation[i+1] = ''
                            equation[i+2] = ''
                        elif equation[i+1] == '/':
                            # true division: the stored token is a float string such as '2.0'
                            equation[i] = str(int(equation[i]) / int(equation[i+2]))
                            equation[i+1] = ''
                            equation[i+2] = ''
                        elif equation[i+1] == '**':
                            result = int(equation[i]) ** int(equation[i+2])
                            # a result strictly between 0 and 1 is replaced by 1
                            if result < 1 and result > 0:
                                equation[i] = str(1)
                            else:
                                equation[i] = str(result)
                            equation[i+1] = ''
                            equation[i+2] = ''

                    
            if remove: 
                equation = equation[1:-1]
                
                            
            #remove empty strings
            if '' in equation:
                equation = list(filter(None, equation))
            else:
                # nothing was blanked in this pass -> fixed point reached
                simplified = True
                equation = list(filter(None, equation))

    # bare except: every failure (IndexError on an empty list, ZeroDivisionError, ...)
    # falls back to the trivial equation
    except:
        return ['x']
            
    return equation    

In [None]:
# Encode a token list as a list of ints
def char_to_int_array(array):
    """Map every token to an integer gene.

    Single-character symbols (``x + - * / ( )``) become their code point,
    ``'**'`` becomes 420 and every other token is parsed as a number and
    truncated to ``int``.
    """
    single_char_symbols = ('x', '+', '-', '*', '/', '(', ')')
    encoded = []
    for token in array:
        if token == '**':
            code = 420
        elif token in single_char_symbols:
            code = ord(token)
        else:
            code = int(float(token))
        encoded.append(code)
    return encoded

# Decode a list of int genes back into tokens
def int_array_to_char(array):
    """Inverse of ``char_to_int_array``.

    Genes equal to the code point of ``x + - * / ( )`` become that symbol,
    420 becomes ``'**'`` and every other gene is rendered with ``str``.
    """
    symbol_for_code = {
        ord('x'): 'x',
        ord('+'): '+',
        ord('-'): '-',
        ord('*'): '*',
        ord('/'): '/',
        420: '**',
        ord('('): '(',
        ord(')'): ')',
    }
    return [symbol_for_code.get(gene, str(gene)) for gene in array]

# Quick check of both directions. Note that '-0' encodes to 0 and 0 decodes
# to '0', so the sign of a negative zero is not preserved.
print(char_to_int_array(['x', '+', '-0']))
print(int_array_to_char([120, 43, -0]))


In [None]:
# Pad an encoded equation with trailing zeros up to a fixed length of "size"
def transform_array(array, size):
    """Return a float array of length ``size`` that starts with ``array``.

    Positions past the end of ``array`` stay 0.  Writing past ``size``
    raises ``IndexError``.
    """
    padded = np.zeros(size)
    for position, gene in enumerate(array):
        padded[position] = gene
    return padded

# Inverse of transform_array: strip the zero padding
def inverse_transform_array(array):
    """Return the genes before the first 0 as a list of ints.

    Everything from the first zero gene onwards is treated as padding and
    discarded.
    """
    genes = []
    for gene in array:
        if gene != 0:
            genes.append(int(gene))
            continue
        break
    return genes


# Round-trip check: x + 1 padded to 25 genes, then stripped of the padding again.
print(transform_array(char_to_int_array(['x', '+', '1']), 25))
print(inverse_transform_array(transform_array(char_to_int_array(['x', '+', '1']),25)))

In [None]:
# Combine the helpers: token list -> fixed-size genome
def equation_for_GA(equation):
    """Encode a token list as a zero-padded genome of 25 genes."""
    return transform_array(char_to_int_array(equation), 25)

def equation_from_GA(equation):
    """Decode a genome back into a token list (padding removed)."""
    return int_array_to_char(inverse_transform_array(equation))

# Round-trip check of the genome encoding for x + 1.
print(equation_for_GA(['x', '+', '1']))
print(equation_from_GA(equation_for_GA(['x', '+', '1'])))

# 3. Genetic algorithm

In [None]:
# Try a single equation first: row index 2 of the dataset.
eq_num = 2

# Column 0 holds the equation string; columns 1 and 2 hold stringified lists.
true_equation = stringEQtoArray(dataset.iloc[eq_num].values[0])
inputs = stringArrayToArray(dataset.iloc[eq_num].values[1])
# fitness_func reads this module-level `outputs` as its target values.
outputs = np.array(stringArrayToArray(dataset.iloc[eq_num].values[2]))

print(true_equation)
print(inputs)
print(outputs)

In [None]:
def model(equation):
    """Evaluate a genome at x = 1 .. 100.

    Returns:
        ``(outputs, length)`` where ``outputs`` is the array of the 100
        evaluated values and ``length`` is the number of tokens excluding
        parentheses.  For an invalid genome the result is
        ``(np.zeros(100), 0)``.
    """
    tokens = equation_from_GA(equation)

    if not is_valid(tokens):
        return np.zeros(100),0

    equation_tree = EquationTree()
    equation_tree.build_tree(tokens)

    # parentheses do not count towards the equation length
    equation_length = len([token for token in tokens if token != '(' and token != ')'])

    predictions = np.array([equation_tree.evaluate(point) for point in range(1, 101)])
    return predictions, equation_length


In [None]:
def fitness_func(ga_instance, solution, solution_idx):
    """Fitness of a genome for pygad: the negated error (higher is better).

    The error is the sum of ``log10(|prediction - target| + 1)`` over the
    points returned by ``model`` plus ``log(equation_length)`` as a size
    penalty.  Targets are read from the module-level ``outputs``.  A genome
    that is invalid, or whose error cannot be computed or is not a number,
    gets ``-inf``.

    Args:
        ga_instance: the running ``pygad.GA`` instance (unused).
        solution: the genome to score.
        solution_idx: index of the genome in the population (unused).
    """
    model_outputs, equation_length = model(solution)

    # model() reports length 0 only for invalid genomes; return the worst
    # fitness directly instead of taking log(0).
    if equation_length == 0:
        return -np.inf

    model_outputs = np.array(model_outputs)

    try:
        error = np.sum(np.log10(np.abs(model_outputs - outputs) + 1)) + np.log(equation_length)

        if error < 0:
            error = -error

        # NaN (e.g. from inf - inf) must not leak out as a fitness value:
        # comparisons with NaN are always False, which breaks ranking.
        if np.isnan(error):
            error = np.inf

    except Exception:
        # e.g. object arrays of huge Python ints have no log10
        error = np.inf

    return -error

In [None]:
def crossover_func(parents, offspring_size, ga_instance):
    """Subtree crossover for pygad.

    Parents are paired as (0, 1), (2, 3), ... and wrap around when more
    offspring are requested than there are pairs.  For every pair a random
    subtree of the first parent is grafted onto a random node of the second
    parent; the result is simplified and, if it is not a valid equation,
    replaced by the first parent.

    Args:
        parents: 2-D array of selected parent genomes, one per row.
        offspring_size: ``(number_of_offspring, number_of_genes)``; if it
            cannot be indexed, one child per parent pair is produced.
        ga_instance: the running ``pygad.GA`` instance (unused).

    Returns:
        Float array of shape ``(number_of_offspring, 25)``.
    """
    genome_length = 25
    num_parents = len(parents)

    try:
        num_offspring = int(offspring_size[0])
    except (TypeError, IndexError):
        num_offspring = num_parents // 2

    if num_parents == 0:
        return np.empty((0, genome_length))

    offspring = np.empty((num_offspring, genome_length))

    for child_idx in range(num_offspring):
        first_idx = (2 * child_idx) % num_parents
        second_idx = (first_idx + 1) % num_parents

        parent1 = equation_from_GA(parents[first_idx])
        parent2 = equation_from_GA(parents[second_idx])

        # An unparsable parent is replaced by its partner; if neither
        # parses, fall back to the trivial equation (as mutation_func does).
        parent1_ok = simpl_is_valid(parent1)
        parent2_ok = simpl_is_valid(parent2)
        if not parent1_ok and not parent2_ok:
            parent1, parent2 = ['x'], ['x']
        elif not parent1_ok:
            parent1 = parent2
        elif not parent2_ok:
            parent2 = parent1

        parent1_tree = EquationTree()
        parent1_tree.build_tree(parent1)

        parent2_tree = EquationTree()
        parent2_tree.build_tree(parent2)

        parent1_subtree = parent1_tree.get_random_subtree()
        parent2_tree.replance_random_subtree(parent1_subtree)
        child = simplify_equation(parent2_tree.get_array())

        if not is_valid(child):
            child = parent1

        offspring[child_idx] = np.array(equation_for_GA(child), dtype=int)

    return offspring

In [None]:
def mutation_func(offspring, ga_instance):
    """Token-level mutation for pygad.

    For every genome one mutable token is picked uniformly at random and
    changed:

    * an operator is replaced by a random operator,
    * a number is replaced by a random non-zero integer from -10 .. 9,
    * ``x`` is replaced by ``( x + c )`` with such a random constant ``c``.

    The result is simplified; whenever the mutated or the simplified
    equation is not valid, the unmodified genome is used instead.  A genome
    without any mutable token is passed through unchanged.

    Args:
        offspring: 2-D array of genomes, one per row.
        ga_instance: the running ``pygad.GA`` instance (unused).

    Returns:
        Integer array with one mutated genome of 25 genes per input row
        (an empty ``(0, 25)`` array for empty input).
    """
    operators = ['+', '-', '*', '/', '**']

    def random_constant():
        """Random integer token from -10 .. 9, with 0 mapped to 1."""
        number = str(np.random.randint(-10, 10))
        return '1' if number == '0' else number

    mutated_rows = []

    for genome in offspring:
        # Work on a plain Python list: inserting into a NumPy string array
        # truncates longer tokens (e.g. '-3') to the array's item width.
        eq_mutated = list(equation_from_GA(genome))
        if not simpl_is_valid(eq_mutated):
            eq_mutated = ['x']

        # Choosing among the mutable positions directly (instead of
        # re-rolling until one is hit) cannot loop forever.
        mutable_positions = [
            position for position, token in enumerate(eq_mutated)
            if token in operators or token == 'x' or token.lstrip('-').isdigit()
        ]

        if mutable_positions:
            position = random.choice(mutable_positions)
            token = eq_mutated[position]
            if token in operators:
                eq_mutated[position] = str(np.random.choice(operators))
            elif token == 'x':
                eq_mutated[position:position + 1] = ['(', 'x', '+', random_constant(), ')']
            else:
                eq_mutated[position] = random_constant()

        if not is_valid(eq_mutated):
            eq_mutated = equation_from_GA(genome)

        eq_mutated = simplify_equation(eq_mutated)

        if not is_valid(eq_mutated):
            eq_mutated = equation_from_GA(genome)

        mutated_rows.append(np.array(equation_for_GA(eq_mutated), dtype=int))

    if not mutated_rows:
        return np.empty((0, 25))
    return np.vstack(mutated_rows).astype(int)

In [None]:
# Manual check: mutate the single equation x + x (0 stands in for the unused
# ga_instance argument) and show the result both encoded and decoded.
mut = mutation_func(np.array([equation_for_GA(['x', '+', 'x'])]), 0)
print(mut)
print(equation_from_GA(mut[0]))

In [None]:
# Seed equations of the form x <op> k: k = 1..9 for + and *, k = 2..9 for ** and /
pop1 = [['x', '+', str(k)] for k in range(1, 10)]
pop2 = [['x', '*', str(k)] for k in range(1, 10)]
pop3 = [['x', '**', str(k)] for k in range(2, 10)]
pop4 = [['x', '/', str(k)] for k in range(2, 10)]

# encode every seed equation as a fixed-size genome
initial_population = [equation_for_GA(seed) for seed in pop1 + pop2 + pop3 + pop4]

# allowed gene values: x, the five operators, and the constants 1..9 and -1..-9
gene_space = np.array(
    [ord(symbol) for symbol in 'x+-*/']
    + [420]
    + list(range(1, 10))
    + list(range(-1, -10, -1))
)

In [None]:
# Population size follows the seed population
sol_per_pop = len(initial_population)
num_parents_mating = 20
# keep everything except one slot per parent pair
keep_elitism = int(sol_per_pop - num_parents_mating/2)

print("sol_per_pop: ", sol_per_pop)
print("num_parents_mating: ", num_parents_mating)
print("keep_elitism: ", keep_elitism)

In [None]:
# Debug hooks that only print a marker. They are referenced solely by the
# commented-out on_crossover / on_mutation / on_fitness / on_parents keyword
# arguments of the pygad.GA call.
# NOTE(review): the two-argument signatures presumably match what pygad passes
# to those callbacks - confirm before enabling them.
def printCrossover(x,y):
    """Print "Crossover"; both arguments are ignored."""
    print("Crossover")

def printMutation(x,y):
    """Print "Mutation"; both arguments are ignored."""
    print("Mutation")

def printFitness(x,y):
    """Print "Fitness"; both arguments are ignored."""
    print("Fitness")

def printParents(y,parents):
    """Print "Parents"; both arguments are ignored."""
    print("Parents")

In [None]:
# Run the genetic algorithm once per equation for dataset rows 0..19 and plot
# the best candidate against the true outputs.
for i in range(20):   
    try:
        print("Equation: ", i)

        true_equation = stringEQtoArray(dataset.iloc[i].values[0])
        inputs = stringArrayToArray(dataset.iloc[i].values[1])
        # Rebinds the module-level `outputs` that fitness_func reads, so this
        # assignment must happen before the GA for this row is run.
        outputs = np.array(stringArrayToArray(dataset.iloc[i].values[2]))


        # A fresh GA per equation, always started from the same seed population
        # and using the custom crossover and mutation operators.
        ga_instance = pygad.GA(num_generations=200,
                            num_parents_mating=num_parents_mating,
                            fitness_func=fitness_func,
                            initial_population=initial_population,
                            gene_type=int,
                            parent_selection_type="tournament",
                            keep_elitism=keep_elitism,
                            crossover_probability=0.8,
                            crossover_type=crossover_func,
                            mutation_type=mutation_func,
                            mutation_probability=0.5,
                            gene_space=gene_space,
                            stop_criteria="saturate_100")
                            # optional debug hooks (disabled):
                            # on_crossover=printCrossover,
                            # on_mutation=printMutation,
                            # on_fitness=printFitness,
                            # on_parents=printParents)

        ga_instance.run()

        print(ga_instance.plot_fitness())

        solution, solution_fitness, solution_idx = ga_instance.best_solution()

        # evaluation points 1..100, the same points model() uses
        x = range(1, 101)

        # y1: target values of the current row
        y1 = outputs

        # y2: values of the best equation found, evaluated at the same points
        tree = EquationTree()
        tree.build_tree(equation_from_GA(solution))
        y2 = [tree.evaluate(i) for i in x]

        print("True equation: ", true_equation)
        print("Predicted equation: ", equation_from_GA(solution))

        # Plotting the first array
        plt.plot(x, y1, label='True equation')

        # Plotting the second array
        plt.plot(x, y2, label='Predicted equation')

        # Adding labels and title
        plt.xlabel('X-axis label')
        plt.ylabel('Y-axis label')
        plt.title('Two Arrays Plot')

        # Adding legend
        plt.legend()

        # Display the plot
        plt.show()

    # A failure on one equation is reported and the loop moves on to the next row.
    except Exception as e:
        print("ERROR: ", e)
        continue


In [None]:
# Report the best solution of `ga_instance`, i.e. of the last GA that was
# created by the loop above.
solution, solution_fitness, solution_idx = ga_instance.best_solution()
print("Best parameter", equation_from_GA(solution))
print("Fitness", solution_fitness)
print("Index", solution_idx)

In [None]:
# Plot the fitness history of the same GA instance; the call's return value is printed as well.
print(ga_instance.plot_fitness())