In [1]:
import os
from collections import defaultdict

import sympy as sp
from tqdm.notebook import tqdm

import pandas as pd
import seaborn as sns
import numpy as np

In [2]:
# Benchmark logs for the two models; only sheet columns B through H are read.
df_gplearn = pd.read_excel(
    io="../logs/gplearn_2022-06-17_14-14-12.xlsx",
    usecols="B:H",
    engine="openpyxl",
)

df_dsr = pd.read_excel(
    io="../logs/dsr_2022-06-17_15-26-36.xlsx",
    usecols="B:H",
    engine="openpyxl",
)

In [3]:
# Fraction of gplearn rows whose accuracy is above 0.95
df_gplearn.loc[df_gplearn["accuracy"] > 0.95].shape[0] / df_gplearn.shape[0]

0.38461538461538464

In [4]:
# Fraction of DSR rows whose accuracy is above 0.95
df_dsr.loc[df_dsr["accuracy"] > 0.95].shape[0] / df_dsr.shape[0]

0.5961538461538461

In [5]:
# Preview the first five gplearn rows
df_gplearn.head(5)

Unnamed: 0,model_name,equation,number_of_points,hyper_parameter,accuracy,time,predicted_equation
0,gplearn,exp(-x_1**2/2)/sqrt(2*pi),500,Tournament size=10,0.636,11.547687,1/(exp(X0) + 1.48586931755139*exp(X0**2 - X0))
1,gplearn,exp(-(x_2/x_1)**2/2)/(sqrt(2*pi)*x_1),500,Tournament size=10,0.208,11.80459,0.210177540062669/X0
2,gplearn,exp(-((x_2-x_2)/x_1)**2/2)/(sqrt(2*pi)*x_1),500,Tournament size=10,1.0,12.758329,0.398748040747538*I/X2
3,gplearn,x_1/sqrt(1-x_2**2/x_3**2),500,Tournament size=10,0.928,14.210814,X2 + 1/(-X2 + exp(sqrt(X1)))
4,gplearn,x_1*x_2,500,Tournament size=10,1.0,1.72356,X0*X1


### Ideation 

For each equation predicted by a model, build a frequency count of the operators that appear in it. The counts from the different models are then combined, and the top `k` operators are chosen from the combined frequency dict.

```json
{
    id: {
        "model_1":{...set of operators used}
        "model_2":{...set of operators used}
    }
}
```

In [6]:
def sort_freq_dict(freqDict:dict) -> dict:
    """Return a copy of ``freqDict`` ordered from the largest value to the smallest.

    Keys that share the same value keep the relative order they had in the input.
    """
    ranked_keys = sorted(freqDict, key=freqDict.__getitem__, reverse=True)
    return {key: freqDict[key] for key in ranked_keys}

def combine_freq_dicts(freq_dicts:list) -> dict:
    """Returns a combined frequency dictionary from input frequency dicts.

    Parameters
    ----------
    freq_dicts : iterable of dict
        Frequency dictionaries mapping a key to a numeric value. Entries whose
        value is ``None`` are ignored.

    Returns
    -------
    dict
        Maps every key that has at least one non-``None`` value to the sum of
        its values. Keys appear in the order they are first encountered.
    """
    result = {}

    # Accumulate in a single pass, in encounter order. Routing the keys through
    # a set would make the key order of the result depend on string hashing,
    # which differs between interpreter runs, so any caller that breaks ties by
    # dict order would not be reproducible.
    for freq_dict in freq_dicts:
        for key, value in freq_dict.items():
            if value is None:
                continue
            result[key] = result.get(key, 0) + value

    return result

def compute_acc_weight(acc):
    """Map an accuracy value to the weight ``exp(10*acc - 10)``.

    An accuracy of 1 gives a weight of exactly 1; the weight shrinks
    exponentially as the accuracy decreases.
    """
    exponent = 10*acc - 10
    return np.exp(exponent)

def compute_weighted_op_count(opp_acc_list) -> list:
    """Scale every operator count by the weight derived from its accuracy.

    Parameters
    ----------
    opp_acc_list : iterable of (dict, float)
        Pairs of an operator-frequency dict and the accuracy that goes with it.

    Returns
    -------
    list of dict
        One new dict per input pair, in input order, mapping each operator to
        ``count * compute_acc_weight(accuracy)``. The input dicts are left
        untouched.
    """
    weighted_opp_acc_list = []

    for ops_count, acc in opp_acc_list:

        w = compute_acc_weight(acc)

        # Build a fresh dict instead of overwriting the caller's counts:
        # writing into `ops_count` would destroy the raw frequencies and make
        # a second call on the same data apply the weight twice.
        weighted_opp_acc_list.append({op: freq*w for op, freq in ops_count.items()})

    return weighted_opp_acc_list

In [122]:
def get_op_freq(eq) -> dict:
    """Returns the frequency for each operator as dict.

    Parameters
    ----------
    eq
        Object exposing ``count_ops(visual=True)`` (a SymPy expression in this
        notebook). The string form of that result is parsed, and is expected
        to look like ``"2*ADD + MUL + POW"``.

    Returns
    -------
    dict
        Lower-cased operator name mapped to the number of times it occurs.
        Empty when the expression contains no operators.
    """
    ops_string = str(eq.count_ops(visual=True)).strip()

    # An expression without any operation (a lone symbol or number) is
    # reported as 0; without this guard the parser below would record a
    # bogus operator named "0".
    if ops_string in ("", "0"):
        return {}

    result = {}

    for term in ops_string.split("+"):
        # Each term is either "NAME" (occurs once) or "COUNT*NAME".
        head, star, tail = term.strip().partition("*")
        if star:
            result[tail.strip().lower()] = int(head)
        else:
            result[head.lower()] = 1

    return result

# Algorithm

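1. Compute a weight $w$ for each predicted equation: $w = \exp(10x - 10)$, where $x$ is the model's accuracy on that equation.
2. Multiply each operator's frequency by the weight.
3. Sum the weighted values of each operator across the models.
4. Sort the combined dict by value, in descending order.
5. Choose the top `k` operators (`k = 4` in the cell below).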

In [129]:
k = 4  # number of top-ranked operators kept per equation
cols = ["equation", "predicted_equation", "model_name", "accuracy"]

# Pair the i-th DSR row with the i-th gplearn row. zip() stops at the shorter
# frame; presumably both logs list the same equations in the same order —
# TODO confirm.
iterator = enumerate(zip(df_dsr[cols].iterrows(), df_gplearn[cols].iterrows()))

# Rows are gathered in a plain list and turned into a frame once, after the
# loop: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# it copied the whole frame on every call.
ensemble_rows = []

for idx, model_results in iterator:

    consolidated_ops_count = []

    for _, result in model_results:

        # NOTE(review): sympify() evaluates strings with eval — only feed it
        # equations from trusted log files.
        sympy_eq = sp.simplify(sp.sympify(result.predicted_equation))
        ops_count = get_op_freq(sympy_eq)

        consolidated_ops_count.append((ops_count, result.accuracy))

    # Weight each model's operator counts by its accuracy, then sum per operator.
    weighted_ops_count = compute_weighted_op_count(consolidated_ops_count)
    consolidated_ops = combine_freq_dicts(weighted_ops_count)

    # Operators ranked by combined weight, highest first, cut to the top k.
    result_ops = sorted(consolidated_ops, key=consolidated_ops.get, reverse=True)[:k]
    ops = ','.join(result_ops)
    print(f'Equation {idx}: {ops}')

    ensemble_rows.append({
        # The ground-truth equation is read from the first frame of the pair (DSR).
        "equation": model_results[0][1].equation,
        "ops": ops,
        "top_k": k,
    })

ensemble_df = pd.DataFrame(ensemble_rows, columns=["equation", "ops", "top_k"])

Equation 0: mul,add,log,exp
Equation 1: mul,add,div,log
Equation 2: div,exp,sin,mul
Equation 3: div,add,mul,pow
Equation 4: mul,exp,log
Equation 5: mul,exp,div,add
Equation 6: mul
Equation 7: mul
Equation 8: add,div,pow,mul
Equation 9: mul,div,add,sub
Equation 10: mul,sub,div,exp
Equation 11: mul,sin
Equation 12: div
Equation 13: sin,mul,cos,pow
Equation 14: div,log,add,exp
Equation 15: div
Equation 16: div,mul,add,cos
Equation 17: mul,div,pow,add
Equation 18: mul,log,pow,div
Equation 19: mul,add,log,exp
Equation 20: mul,log,div,add
Equation 21: add,cos,mul,sub
Equation 22: mul,div,add,pow
Equation 23: add,mul,pow,div
Equation 24: mul
Equation 25: div,mul,exp,log
Equation 26: mul,add,pow,log
Equation 27: mul,neg,log,div
Equation 28: mul,add,log,div
Equation 29: mul,log,div,add
Equation 30: mul,add,sub,exp
Equation 31: div,mul,add,pow
Equation 32: mul,exp,div,add
Equation 33: sub,pow,div,mul
Equation 34: mul,add,pow,log
Equation 35: mul,cos,neg,div
Equation 36: mul,cos,neg,add
Equation 

In [131]:
# Derive the file name from `k` so that a run with a different cut-off is not
# saved under the "top_4" label (the path is unchanged for k = 4).
ensemble_df.to_excel(f"../results/top_{k}_ops_gp-dsr-heuristic-exp.xlsx")