In [68]:
import jax
from jax import lax, random, numpy as jnp
from jax.tree_util import tree_flatten, tree_unflatten
import wandb
import flax
from flax import linen as nn

import sympy as sy
from sympy.core.rules import Transform
import numpy as np

import sys
sys.path.append("..")
sys.path.append("../../orient/")


from eql.eqlearner import EQL, EQLdiv
from eql.symbolic import get_symbolic_expr_div, get_symbolic_expr
from eql.np_utils import flatten, unflatten


import optax
import scipy
from functools import partial
import matplotlib.pyplot as plt


In [None]:
# Start a Weights & Biases run for experiment tracking.
# The `config` dict is descriptive metadata only: no cell visible in this notebook
# reads it back, and the training cells hard-code their own learning rate,
# thresholds and iteration counts. Keep the two in sync by hand.
wandb.init(
    # set the wandb project where this run will be logged
    project="Example_3",

    # track hyperparameters and run metadata
    config={
    "learning_rate": 0.0001,
    "architecture": "EQLdiv_2_Layers",
    "epochs": 20000,
    "optimizer": "Adam",
    "regularization": "No_reg(5000)+L1(14000)+Pruning(1000)",
    "Batchsize": 10000,
    "Reg_Factor": 0.1,
    "Threshold_mask": 0.001,
    "input_dim": 4,
    "output_dim": 1,
    "Threshold_div": "1/sqrt(t+1)"
    }
)

In [70]:
# Function names handed to EQLdiv as `functions`: a 7-entry pattern repeated
# three times (21 names in total).
funs = ['mul', 'cos' , 'sin', 'id', 'id', 'id', 'id']*3
# The model used by every loss function below.
e = EQLdiv(n_layers=2, functions=funs, features=1)
# Single PRNG key shared by the data sampling and parameter initialisation cells.
key = random.PRNGKey(0)

In [71]:
# Synthetic regression data: N samples with xdim input features each.
N = 10000
xdim = 4
# random.uniform draws from [0, 1) by default, so the inputs end up in [-1, 1).
# NOTE(review): the same `key` is reused unchanged for `e.init` in a later cell;
# splitting it would keep data and initial weights statistically independent.
x = (random.uniform(key, (N, xdim))-.5) * 2
#x = np.array([[1., 2.]]).T
#x = np.linspace(-1, 1, N)[:,None]
#y = x[:,0] + jnp.cos(x[:,1])
#y = (jnp.cos(x) + 1 - x**2)/(x-3)**3


#y = np.sin(np.pi * x[:,0])/(x[:,1]**2 + 1)
# Target function to be recovered; it depends on all four input columns.
y = 1./3. * ((1.+x[:,1])*np.sin(np.pi*x[:,0]) + x[:,1]*x[:,2]*x[:,3])
#plt.scatter(x[:,0], x[:,1], c=y)

In [72]:
# Initialise the model parameters by tracing the model on the full data set
# with a threshold argument of 1.0.
params = e.init({'params':key}, x, 1.0)

In [73]:
def mse_fn(params, threshold):
    """Return the mean squared error of the model on the global data set (x, y).

    Args:
        params: parameter pytree passed to ``e.apply``.
        threshold: threshold argument forwarded to ``e.apply``.

    Returns:
        Scalar mean of the squared residuals ``pred - y``.
    """
    # The second model output is not needed for the plain MSE.
    prediction = e.apply(params, x, threshold)[0]
    residual = prediction - y
    return jnp.mean(residual**2)

def mse_b_fn(params, threshold):
    """Return the MSE on the global data set together with the model's second output.

    Args:
        params: parameter pytree passed to ``e.apply``.
        threshold: threshold argument forwarded to ``e.apply``.

    Returns:
        Tuple ``(mse, b)`` where ``b`` is the second value returned by ``e.apply``.
    """
    prediction, b_val = e.apply(params, x, threshold)
    residual = prediction - y
    mse = jnp.mean(residual**2)
    return mse, b_val

def mse_b_y_fn(params, threshold):
    """Return the MSE, the model's second output and the raw predictions.

    Args:
        params: parameter pytree passed to ``e.apply``.
        threshold: threshold argument forwarded to ``e.apply``.

    Returns:
        Tuple ``(mse, b, pred)`` evaluated on the global data set (x, y).
    """
    prediction, b_val = e.apply(params, x, threshold)
    residual = prediction - y
    mse = jnp.mean(residual**2)
    return mse, b_val, prediction


def get_mask_spec(thresh, params):
    """Build per-leaf boolean masks selecting entries with magnitude above ``thresh``.

    Args:
        thresh: magnitude threshold; entries with ``|value| > thresh`` are kept.
        params: parameter pytree.

    Returns:
        Tuple ``(mask, spec)``: a list with one boolean array per leaf of
        ``params`` and the tree definition produced by ``tree_flatten``.
    """
    leaves, treedef = tree_flatten(params)
    keep = []
    for leaf in leaves:
        keep.append(jnp.abs(leaf) > thresh)
    return keep, treedef

def apply_mask(mask, spec, params):
    """Multiply every leaf of ``params`` by its mask and rebuild the pytree.

    Args:
        mask: list of arrays, one per leaf of ``params`` in flattening order.
        spec: tree definition used to rebuild the result.
        params: parameter pytree to be masked.

    Returns:
        Pytree with structure ``spec`` whose leaves are ``leaf * mask_entry``.
    """
    leaves = tree_flatten(params)[0]
    masked_leaves = [leaf * keep for leaf, keep in zip(leaves, mask)]
    return tree_unflatten(spec, masked_leaves)


def get_masked_mse(thresh, params):
    """Create a jitted MSE function that always applies a mask frozen at creation time.

    The mask is derived once from the ``params`` passed here and is then
    applied to whatever parameters the returned function receives.

    Args:
        thresh: magnitude threshold handed to ``get_mask_spec``.
        params: parameter pytree from which the mask is computed.

    Returns:
        Jitted callable ``masked_mse(params, threshold)`` returning the MSE of
        the masked parameters.
    """
    mask, spec = get_mask_spec(thresh, params)

    @jax.jit
    def masked_mse(params, threshold):
        return mse_fn(apply_mask(mask, spec, params), threshold)

    return masked_mse
    

def l1_fn(params):
    """Sum, over all leaves under ``params["params"]``, of the mean absolute value.

    Args:
        params: mapping with a ``"params"`` entry holding the parameter pytree.

    Returns:
        Sum of the per-leaf means of ``|leaf|`` (``0`` when there are no leaves).
    """
    total = 0
    for leaf in jax.tree.leaves(params["params"]):
        total = total + jnp.abs(leaf).mean()
    return total

def reg_fn(threshold, b):
    """Total amount by which the entries of ``b`` fall below ``threshold``.

    Args:
        threshold: scalar threshold.
        b: array of values compared against the threshold.

    Returns:
        Sum of ``max(0, threshold - b)`` over all entries of ``b``.
    """
    shortfall = threshold - b
    return jnp.sum(jnp.maximum(0, shortfall))

def penalty_fn(y, B=10, supp=3):
    """Penalise outputs whose magnitude exceeds the bound ``B``.

    The result is ``sum(max(y - B, 0) + max(-y - B, 0))``: zero while every
    entry of ``y`` lies in ``[-B, B]`` and growing linearly outside that range.

    The function is pure. Earlier revisions drew a random sample that was never
    used and rebound ``penalty_fn.key`` on every call. Under ``jax.jit`` that
    stored a tracer on the function object, and because the split always
    started from the same global key, the stored key never advanced. Neither
    step affected the returned value, so both were removed.

    Args:
        y: array of model outputs to be bounded.
        B: magnitude bound.
        supp: unused; kept so existing calls that pass it keep working.

    Returns:
        Scalar penalty.
    """
    overshoot = jnp.maximum(y - B, 0)
    undershoot = jnp.maximum(-y - B, 0)
    return jnp.sum(overshoot + undershoot)
# Attach the notebook's PRNG key to the function object as an attribute.
penalty_fn.key = key

In [74]:
def get_loss(lamba):
    """Build the training loss: MSE + ``lamba`` * L1 term + threshold regulariser.

    Args:
        lamba: weight of the ``l1_fn`` term.

    Returns:
        Callable ``loss_fn(params, threshold)`` returning the scalar loss.
    """
    def loss_fn(params, threshold):
        data_term, b_val = mse_b_fn(params, threshold)
        sparsity_term = lamba * l1_fn(params)
        threshold_term = reg_fn(threshold, b_val)
        return data_term + sparsity_term + threshold_term
    return loss_fn

def get_loss_pen():
    """Build the penalty loss: ``penalty_fn`` on the predictions + threshold regulariser.

    Returns:
        Callable ``loss_fn(params, threshold)`` returning the scalar loss. The
        MSE computed by ``mse_b_y_fn`` is not part of this loss.
    """
    def loss_fn(params, threshold):
        _, b_val, prediction = mse_b_y_fn(params, threshold)
        return penalty_fn(prediction) + reg_fn(threshold, b_val)
    return loss_fn

def get_loss_grad(lamba=1e-3, is_penalty=False):
    """Return a jitted function computing a loss and its gradient.

    Args:
        lamba: L1 weight handed to ``get_loss``; ignored when ``is_penalty`` is true.
        is_penalty: select the penalty loss from ``get_loss_pen`` instead of
            the regular training loss.

    Returns:
        ``jax.jit(jax.value_and_grad(loss))`` for the selected loss; the
        gradient is taken with respect to the first argument.
    """
    loss = get_loss_pen() if is_penalty else get_loss(lamba)
    return jax.jit(jax.value_and_grad(loss))

In [75]:
# Adam optimiser shared by every training phase below (also used inside do_step).
tx = optax.adam(learning_rate=1e-4)
# Optimiser state matching the structure of the freshly initialised parameters.
opt_state = tx.init(params)

In [76]:
# Jitted value-and-gradient functions for the three losses used in training:
# the penalty loss, the loss with the L1 weight set to 0, and the loss with an
# L1 weight of 0.1.
loss_grad_pen = get_loss_grad(is_penalty=True)
loss_grad_1 = get_loss_grad(0)
loss_grad_2 = get_loss_grad(1e-1)

In [77]:
def do_step(loss_grad, params, theta, opt_state):
    """Perform one optimiser step with the global optimiser ``tx``.

    Args:
        loss_grad: callable ``(params, theta) -> (loss, grad)``.
        params: current parameter pytree.
        theta: threshold forwarded to ``loss_grad``.
        opt_state: current optimiser state.

    Returns:
        Tuple ``(new_params, new_opt_state, loss_val)``; ``loss_val`` is the
        loss evaluated before the update.
    """
    loss_val, grad = loss_grad(params, theta)
    updates, new_state = tx.update(grad, opt_state)
    new_params = optax.apply_updates(params, updates)
    return new_params, new_state, loss_val

In [78]:
# Main training loop: the first T1 iterations use loss_grad_1, the remaining
# ones use loss_grad_2.
T1 = 5000
# NOTE(review): Tpenalty is never read in the visible code; the penalty step
# below is triggered by the hard-coded `i % 50` instead.
Tpenalty = 500
for i in range(19000):
    # Threshold handed to the model and to reg_fn; decays as 1/sqrt(i + 1).
    theta = 1./jnp.sqrt(i/1 + 1)
    if i < T1:
        lg = loss_grad_1
    elif i >= T1:  # always true when reached; equivalent to a plain `else`
        lg = loss_grad_2
    params, opt_state, loss_val = do_step(lg, params, theta, opt_state)
    if i % 50 == 0 and i > 0:
        print(loss_val, theta)
        wandb.log({"loss": loss_val, "division_threshold": theta})
        # Extra optimiser step on the penalty loss. The loss_val it returns is
        # overwritten by the next regular step before anything is logged.
        params, opt_state, loss_val = do_step(loss_grad_pen, params, theta, opt_state)
    
# Pruning phase: optimise the plain MSE while zeroing small parameters.
thr = 1e-3
# get_masked_mse freezes the mask computed from the current params and already
# returns a jitted function, so the outer jax.jit is redundant but harmless.
loss_grad_masked = jax.jit(jax.value_and_grad(get_masked_mse(thr, params)))
mask, spec = get_mask_spec(thr, params)
# Continue the threshold schedule where the previous loop stopped.
T = 19000
for i in range(1000):
    theta = 1./jnp.sqrt(T/1 + 1)
    loss_val, grads = loss_grad_masked(params, theta)
    updates, opt_state = tx.update(grads, opt_state)
    params = optax.apply_updates(params, updates)
    # Re-derive the mask from the updated params and zero every entry whose
    # magnitude is not above thr.
    # NOTE(review): this mask is recomputed each iteration and can differ from
    # the frozen mask used inside loss_grad_masked.
    mask, spec = get_mask_spec(thr, params)
    params = apply_mask(mask, spec, params)
    T +=1
    if i % 100 == 0:
        print(loss_val)
        wandb.log({"loss": loss_val, "division_threshold": theta})

1.0126877 0.14002801
0.92921835 0.099503726
0.82940906 0.08137884
0.71668327 0.07053456
0.59653574 0.06311944
0.47625625 0.05763904
0.3639161 0.05337605
0.26654124 0.049937617
0.18867475 0.04708816
0.131634 0.044676702
0.09356714 0.042601433
0.070370704 0.040790852
0.05725139 0.03919309
0.050127927 0.037769478
0.04621326 0.03649052
0.04389007 0.035333265
0.042327072 0.03427956
0.04113311 0.033314828
0.040134307 0.03242722
0.03925506 0.031606976
0.038461875 0.030845987
0.037738882 0.030137438
0.037077677 0.029475577
0.036473133 0.028855495
0.03592159 0.028272962
0.035420023 0.027724348
0.034965616 0.027206479
0.034555487 0.026716584
0.034186564 0.026252234
0.03385556 0.025811287
0.033558927 0.025391836
0.033292964 0.024992192
0.033053868 0.024610842
0.032837853 0.024246432
0.032641295 0.023897745
0.032460786 0.023563681
0.03229324 0.023243247
0.032135956 0.022935538
0.031986594 0.022639737
0.031843226 0.02235509
0.03170429 0.02208092
0.031568523 0.021816595
0.03143497 0.02156154
0.03130

In [91]:
# Display the parameter pytree as left by the pruning loop.
params

{'params': {'last': {'bias': Array([0.        , 0.52510214], dtype=float32),
   'kernel': Array([[ 0.        , -0.04764245],
          [ 0.2628326 ,  0.        ],
          [ 0.        ,  0.        ],
          [ 0.        ,  0.        ],
          [ 0.        ,  0.        ],
          [ 0.        ,  0.        ],
          [ 0.        , -0.03786314],
          [-0.19450085,  0.        ],
          [ 0.        ,  0.        ],
          [ 0.        ,  0.        ],
          [ 0.        ,  0.        ],
          [ 0.        ,  0.        ],
          [ 0.        , -0.03659477],
          [ 0.        ,  0.        ],
          [ 0.        ,  0.        ],
          [ 0.        ,  0.        ],
          [ 0.        ,  0.        ],
          [ 0.        ,  0.        ],
          [ 0.23821977,  0.        ],
          [ 0.19652288,  0.        ],
          [ 0.        ,  0.        ]], dtype=float32)},
  'layers_0': {'linear_layer': {'bias': Array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [79]:
# Convert the trained parameters into a symbolic expression; element [0] of
# the result is the one displayed.
# `params` was already masked in the last step of the pruning loop, so masking
# it again here is unnecessary. The former first assignment, which did so, was
# overwritten immediately and depended on `mask`/`spec`, names that a later
# cell rebinds to different objects.
symb = get_symbolic_expr_div(params, funs)[0]
symb

(0.196522876620293*(-0.571892663900542*x0 - 0.303250439104478*x1*x2 + 0.0962142419380507*x1 - 0.555720534711153*x3)*(0.556464492359865*x0 - 0.298132541192258*x1*x2 - 0.0954423126401025*x1 - 0.541433174803228*x3) + 0.238219767808914*(0.408310394052859*x0*x1 + 0.0627331025751587*x0 + 0.290798825379002*x1*x2 + 0.144915341342324*x1 - 0.492962948325648*x3 + 0.490165293216705)*(0.416472317729116*x0*x1 + 0.0632840140896462*x0 - 0.296423542361108*x1*x2 + 0.148989829790255*x1 + 0.502743308557733*x3 + 0.499514490365982) - 0.194500848650932*sin(-0.434955982322387*x0*x1 + 1.41475631658651*x0 + 0.311649726698974*x1 + 0.266796763986349) + 0.26283261179924*sin(0.757528618331796*x0*x1 + 2.47605270420621*x0 + 0.0955053170303133*x1 - 0.0348944216966629))/(0.439596552401781 - 0.0365947671234608*cos(0.428053245433383*x0*x1))

In [80]:
# Flatten the parameters and select the entries that survive a magnitude
# threshold of 0.01 (note: larger than the 1e-3 used during pruning).
# NOTE(review): this rebinds `spec` and `mask`, which earlier cells use for the
# pytree-based mask; re-running those cells after this one will not work.
spec, fparam = flatten(params)
full_shape = fparam.shape
mask = jnp.abs(fparam) > 0.01
# Positions of the selected entries in the flat parameter vector.
idxs = jnp.arange(fparam.shape[0])[mask]
# Number of selected entries. The vectorised reduction replaces the builtin
# sum(), which iterated over the array one element at a time in Python.
count = mask.sum().item()

In [81]:
def red_loss_grad_fn(red_param):
    """Loss and gradient as a function of the selected (reduced) parameters only.

    The reduced vector is scattered into a zero vector of the full flat shape
    at positions ``idxs``, turned back into a parameter tree, and evaluated
    with ``loss_grad_1`` at a fixed threshold of 1e-4.

    Args:
        red_param: 1-D vector holding the values of the selected parameters.

    Returns:
        Tuple ``(loss, grad)`` where ``grad`` is a NumPy array containing only
        the gradient entries at ``idxs``.
    """
    dense = jnp.zeros(full_shape).at[idxs].set(red_param)
    param_tree = unflatten(spec, dense)

    loss, grad_tree = loss_grad_1(param_tree, 1e-4)
    flat_grad = flatten(grad_tree)[1]
    return loss, np.array(flat_grad)[idxs,]
    
#red_loss_grad = jax.jit(jax.value_and_grad(red_mse_fn))

In [82]:
# Refine only the parameters selected by `mask` with L-BFGS-B, starting from
# their current values. red_loss_grad_fn returns (loss, gradient), the form
# fmin_l_bfgs_b expects when neither fprime nor approx_grad is given.
# Results: x0 is the minimiser, f the final objective value, info a dict with
# solver diagnostics. factr and pgtol are set very tight so the solver keeps
# iterating until it can make essentially no further progress.
x0, f, info = scipy.optimize.fmin_l_bfgs_b(
        red_loss_grad_fn,
        x0 = np.array(fparam[mask]),
        factr=1.,
        m=500,
        pgtol=1e-13,
        maxls=100)
#x0[np.abs(x0) < 1e-3] = 0.0

In [83]:
f

1.2868163423718215e-07

In [84]:
# Scatter the refined values back into a zero vector of the full flat shape
# and rebuild the parameter tree; every unselected parameter is exactly zero.
final_param = unflatten(spec, jnp.zeros(full_shape).at[idxs].set(x0))

In [85]:
# Symbolic expression of the refined parameters (this rebinds `symb`).
symb = get_symbolic_expr_div(final_param, funs)[0]
symb

(0.441472858190536*(-1.25568286401258*x0 - 0.0531444274399164*x1*x2 + 0.0230347746280475*x1 - 0.430609215012872*x3)*(1.21964024247799*x0 - 0.0544555855618636*x1*x2 - 0.0223931251368926*x1 - 0.39733614009198*x3) + 0.365108609199524*(-0.00123586160376603*x0*x1 + 1.33193483516241*x0 - 0.0597104248937456*x1*x2 + 0.0271901778472066*x1 + 0.457182527748664*x3 + 0.0519460383802652)*(-0.00120585908931993*x0*x1 + 1.38780800737631*x0 + 0.0589845953150548*x1*x2 + 0.0256236895707744*x1 - 0.452517686872525*x3 + 0.0514163561165333) - 0.494146972894669*sin(-0.0724506731615455*x0*x1 + 2.27833418079157*x0 + 0.00155771751490895*x1 + 0.00985350087285042) + 0.493932992219925*sin(0.0741776887875744*x0*x1 + 2.42530852382615*x0 - 0.000448779117734478*x1 + 0.0078737223520875))/(0.16420105099678*cos(0.0456507417675684*x0*x1) - 0.0460177510976791)

In [86]:
def clean_expr(expr):
    """Tidy a sympy expression by dropping tiny constants and rounding numbers.

    Steps, in order:
      1. replace every number with magnitude below 1e-5 by 0,
      2. round every number to 3 digits,
      3. expand the expression and replace every number with magnitude below
         1e-3 by 0.

    WARNING: might return 0/NaN/inf if the expression only contains small numbers.

    Args:
        expr: sympy expression.

    Returns:
        The cleaned, expanded expression (not simplified).
    """
    def _is_number(node):
        return node.is_Number

    def _drop_small(target, thr=1e-5):
        def _is_tiny(node):
            return node.is_Number and abs(node) < thr

        def _zero(node):
            return 0

        return target.replace(_is_tiny, _zero)

    def _round_numbers(target, dig=3):
        def _rounded(node):
            return node.round(dig)

        return target.xreplace(Transform(_rounded, _is_number))

    pruned = _drop_small(expr)
    rounded = _round_numbers(pruned)
    expanded = sy.expand(rounded)
    return _drop_small(expanded, 1e-3)
    # return sy.simplify(expr)

In [87]:
# Display the cleaned-up form of the refined expression.
clean_expr(symb)

0.05085*x0*x1/(0.164*cos(0.046*x0*x1) - 0.046) + 0.05115*x0/(0.164*cos(0.046*x0*x1) - 0.046) + 0.0393*x1*x2*x3/(0.164*cos(0.046*x0*x1) - 0.046) + 0.494*sin(0.074*x0*x1 + 2.425*x0 + 0.008)/(0.164*cos(0.046*x0*x1) - 0.046) - 0.494*sin(-0.072*x0*x1 + 2.278*x0 + 0.002*x1 + 0.01)/(0.164*cos(0.046*x0*x1) - 0.046)

In [88]:
f

1.2868163423718215e-07

In [89]:
symb


(0.441472858190536*(-1.25568286401258*x0 - 0.0531444274399164*x1*x2 + 0.0230347746280475*x1 - 0.430609215012872*x3)*(1.21964024247799*x0 - 0.0544555855618636*x1*x2 - 0.0223931251368926*x1 - 0.39733614009198*x3) + 0.365108609199524*(-0.00123586160376603*x0*x1 + 1.33193483516241*x0 - 0.0597104248937456*x1*x2 + 0.0271901778472066*x1 + 0.457182527748664*x3 + 0.0519460383802652)*(-0.00120585908931993*x0*x1 + 1.38780800737631*x0 + 0.0589845953150548*x1*x2 + 0.0256236895707744*x1 - 0.452517686872525*x3 + 0.0514163561165333) - 0.494146972894669*sin(-0.0724506731615455*x0*x1 + 2.27833418079157*x0 + 0.00155771751490895*x1 + 0.00985350087285042) + 0.493932992219925*sin(0.0741776887875744*x0*x1 + 2.42530852382615*x0 - 0.000448779117734478*x1 + 0.0078737223520875))/(0.16420105099678*cos(0.0456507417675684*x0*x1) - 0.0460177510976791)

In [90]:
# Close the Weights & Biases run started at the top of the notebook.
wandb.finish()

0,1
division_threshold,█▆▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,█▁▁▁▁▁▁▁▁▁▁▁▁▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁

0,1
division_threshold,0.00709
loss,5e-05
