In [27]:
import numpy as np
import pandas as pd
import random as rd
import time

## Generate input data

The parameters can be adjusted to increase/decrease the workload.

In [3]:
N = 100000
FEATURES = 15

cols = "abcdefghijkmnopqrstuv"
columns = list(cols)[:FEATURES]

x = np.random.rand(N, FEATURES)

df = pd.DataFrame(x, columns = columns)
df["y"] = np.sin(df["a"].values) + np.cos(df["b"].values) + np.random.rand(N) * 0.001

df.to_csv("data.csv")


unary_funs = ["sinf", "cosf", "sqrtf"]
operators = ["+", "-"]

def random_program(depth=4):
    r = rd.randint(0,100)
    if depth == 0 or r < 30:
        c = rd.choice(columns)
        return f"_{c}_"
    elif r < 80:
        c = rd.choice(unary_funs)
        r = random_program(depth-1)
        return f"{c}({r})"
    else:
        c = rd.choice(operators)
        r1 = random_program(depth-1)
        r2 = random_program(depth-1)
        return f"({r1}) {c} ({r2})"


with open("functions.txt", "w") as f:
    for _ in range(1000):
        f.write(random_program() + "\n")

In [17]:
df = pd.read_csv("data.csv")
display(df.head())

Unnamed: 0.1,Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k,m,n,o,p,y
0,0,0.701472,0.553664,0.006426,0.908177,0.565447,0.772378,0.066453,0.522644,0.099342,0.751865,0.206676,0.391573,0.714122,0.974487,0.789062,1.496423
1,1,0.438147,0.908836,0.088304,0.964788,0.508435,0.016098,0.021518,0.454406,0.218316,0.899252,0.090579,0.153759,0.274397,0.794021,0.78484,1.039112
2,2,0.18035,0.604814,0.883602,0.91887,0.786991,0.101334,0.92472,0.988423,0.698398,0.101556,0.918204,0.333705,0.757223,0.279271,0.669572,1.002524
3,3,0.220502,0.891454,0.568761,0.468157,0.025218,0.870858,0.633421,0.324128,0.74345,0.483112,0.810347,0.226594,0.132563,0.621351,0.896972,0.847733
4,4,0.141556,0.526536,0.08985,0.623158,0.951209,0.648823,0.789727,0.996261,0.411132,0.496707,0.597377,0.595296,0.877285,0.492387,0.955452,1.006028


## Run sequential version

In [None]:
funs = [ line.strip() for line in open("functions.txt").readlines() ]

def score(line):
    # Use np.emath.sqrt to handle negative inputs by returning complex numbers
    line = line.replace("sqrtf", "np.emath.sqrt")

    # Replace other unary functions with their NumPy equivalents
    for u in ["sinf", "cosf", "tanf", "expf"]:
        line = line.replace(u, f"np.{u[:-1]}")
    
    # Replace column placeholders with actual DataFrame column values
    for c in df.columns:
        line = line.replace(f"_{c}_", f"(df[\"{c}\"].values)")
    
    # Evaluate the expression
    try:
        a = eval(line)
        b = df["y"]
        # Calculate Mean Squared Error using the magnitude of the difference
        # np.abs() handles complex numbers correctly (returns magnitude)

        # Now it's sligly slower (+ (0.2, 0.4) seconds), but handles complex results properly
        e = np.mean(np.abs(a - b)**2)
        return e
    except Exception:
        return float('inf')

l = funs[0]
print(score(l), l)

t0 = time.time()
scores = [(score(line), line) for line in funs]
t1 = time.time()

print(f"\nTime taken: {t1 - t0} seconds\n")

scores.sort()
print(f"Best score {scores[0][0]}: {scores[0][1]}")
print(f"Worst score {scores[-1][0]}: {scores[-1][1]}")

0.37730532071334005 cosf(_a_)

Time taken: 1.9374911785125732 seconds

Best score 0.060574792608106345: sqrtf((_a_) + ((_c_) + (sqrtf(_i_))))
Worst score 10.411448171339734: ((((_p_) - (_j_)) + ((_k_) - (_b_))) - ((cosf(_e_)) + (_c_))) - (_n_)
