In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import scienceplots

import math
import os
import random
from functools import partial
from decimal import Decimal
import numpy as np
# from sklearnex import patch_sklearn; patch_sklearn() # if you are using intel cpus
import scipy.io as sio
from scipy.integrate import solve_ivp
from scipy.signal import savgol_filter
import pysindy as ps
from pysindy.utils import lorenz
from tqdm import trange

# NSGA2, DNSGA2, SMSEMOA, AGEMOEA2
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.algorithms.moo.dnsga2 import DNSGA2
from pymoo.algorithms.moo.sms import SMSEMOA
from pymoo.algorithms.moo.age2 import AGEMOEA2
from pymoo.core.problem import ElementwiseProblem
from pymoo.core.sampling import Sampling
from pymoo.core.crossover import Crossover
from pymoo.core.mutation import Mutation
from pymoo.core.duplicate import ElementwiseDuplicateElimination
from pymoo.termination.default import DefaultMultiObjectiveTermination
from pymoo.optimize import minimize
from pymoo.visualization.scatter import Scatter

from utils import *
from skimage.restoration import estimate_sigma
import bm3d
from okridge.solvel0 import *
from best_subset import backward_refinement, brute_force_all_subsets
from solvel0 import solvel0
from UBIC import *
from kneed import KneeLocator

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

  from .autonotebook import tqdm as notebook_tqdm


Sklearn's version: 1.6.1


In [2]:
# Tight-tolerance LSODA settings forwarded to solve_ivp.
integrator_keywords = {
    'rtol': 1e-12,
    'method': 'LSODA',
    'atol': 1e-12,
}

# Uniform time grid on [0, 100) with step dt.
dt = 0.001
t_train = np.arange(0, 100, dt)

# Integrate the Lorenz system from x0_train and keep the trajectory as
# (n_times, n_states) by transposing solve_ivp's (n_states, n_times) output.
x0_train = [-8, 8, 27]
lorenz_solution = solve_ivp(
    lorenz,
    (t_train[0], t_train[-1]),
    x0_train,
    t_eval=t_train,
    **integrator_keywords,
)
x_train = lorenz_solution.y.T

# Right-hand side of the Lorenz system evaluated at every sampled state.
x_dot_train_measured = np.array(
    [lorenz(0, x_train[k]) for k in range(t_train.size)]
)

# Number of terms expected in each of the three governing equations.
true_complexities = [2, 3, 2]

In [3]:
noise_level = 0  # 1e-2

# Snapshot of the trajectory before any corruption.
# NOTE: x_train is overwritten below when noise_level > 0, so re-running this
# cell in that setting would snapshot the already filtered data.
x_train_clean = x_train.copy()
noise = np.zeros(x_train.shape)

if noise_level > 0:
    # Corrupt with zero-mean Gaussian noise ...
    noise = np.random.normal(scale=noise_level, size=x_train.shape)
    x_train = x_train_clean + noise
    # ... then smooth along time with a Savitzky-Golay filter
    # (window length 11, polynomial order 3).
    x_train = savgol_filter(x_train, 11, 3, axis=0)

In [4]:
# TODO: Implement TVDiff
n_poly = 4                       # degree handed to the polynomial feature library
n_ind = len(true_complexities)   # one state variable per governing equation
n_modules = 8                    # upper bound on modules in an initial random genome

# Differentiation scheme handed to the weak-form library.
# The previous cell assigned ps.differentiation.FiniteDifference() first and then
# immediately overwrote it; only the effective choice is kept. To use the plain
# scheme instead, swap in: ps.differentiation.FiniteDifference()
differentiation_method = ps.differentiation.SmoothedFiniteDifference()

In [5]:
# Seed NumPy's global RNG before building the weak-form library so the feature
# matrix is reproducible under Restart & Run All.
# NOTE(review): pysindy's WeakPDELibrary is understood to place its K integration
# subdomains at random using NumPy's global RNG -- confirm for the installed version.
weak_form_seed = 0
np.random.seed(weak_form_seed)

# Weak-form library over polynomial features of degree <= n_poly. The polynomial
# library omits its own bias term; the constant is added by the weak library.
ode_lib = ps.WeakPDELibrary(function_library=ps.PolynomialLibrary(degree=n_poly, include_bias=False), 
                            spatiotemporal_grid=t_train,
                            include_bias=True,
                            K=5000,
                            differentiation_method=differentiation_method,
                            diff_kwargs={"is_uniform":True})

In [6]:
normalize = False

# Weak-form feature matrix and the matching weak-form time-derivative targets.
# fit_transform must run first so the library is fitted before the targets
# are converted.
X_pre = np.array(ode_lib.fit_transform(x_train))
y_pre = np.array(ode_lib.convert_u_dot_integral(x_train))

# Optional per-column scaling; with normalize=False the scale is all ones.
if normalize:
    max_features = X_pre.max(axis=0)
    X_pre = X_pre / max_features
else:
    max_features = np.ones((1, X_pre.shape[-1]))

# Map each exponent tuple produced by distribute_order to its feature column.
base_features = {
    exponents: column
    for exponents, column in zip(distribute_order(n_poly, n_ind), X_pre.T)
}
# Replace the all-zero-exponent column (presumably the constant term) by a
# constant array holding its mean value.
base_features[(0, 0, 0)] = np.full(
    base_features[(0, 0, 0)].shape,
    base_features[(0, 0, 0)].mean(),
)

# Alternative best-subset solvers kept for reference (not executed):
# solvel0(X_pre, y_pre[:, 0:1], miosr=True, refine=True, max_complexity=8)

# from joblib import Parallel, delayed
# res = Parallel(n_jobs=3)(delayed(okridge_solvel0_full)(
#     X_pre, select_column(y_pre, i), k=8) for i in range(3)
#                         )

# GA
# A module is a tuple (a, b, c) of integers >= 0 with a + b + c <= poly_order.

In [7]:
class OdeDiscoveryProblem(ElementwiseProblem):
    """Bi-objective problem: fit error versus model complexity.

    An individual is a vector with one *genome* per target column. A genome is
    a collection of *modules*; each module is a tuple of per-variable exponents
    that keys into ``base_features``.

    Parameters
    ----------
    n_poly : int
        Total polynomial degree budget used when drawing a random module.
    n_inds : int
        Number of entries (state variables) in a module tuple.
    n_modules : int
        Stored for the operators; ``PopulationSampling`` uses it as the upper
        bound on the size of an initial genome.
    base_features : dict
        Maps module tuples to feature arrays.
    target : ndarray
        2-D array indexed as ``target[:, v]``; its last axis sets the number
        of decision variables and ``len(target)`` the sample size.
    epsilon : float, default 1
        Multiplier applied to the complexity objective.
    order_complexity : bool, default False
        When True, the sum of all exponents in the genome is added to the
        complexity penalty on top of the number of modules.
    """

    def __init__(self, n_poly, n_inds, n_modules, 
                 base_features, target, epsilon=1, order_complexity=False):
        super().__init__(n_var=target.shape[-1], n_obj=2, n_ieq_constr=0)
        self.n_poly = n_poly
        # Use the constructor argument; the previous code read the notebook
        # global `n_ind` here and silently ignored `n_inds`.
        self.n_inds = n_inds
        self.n_modules = n_modules
        self.base_features = base_features
        self.target = target
        self.n_target = target.shape[-1]
        self.epsilon = epsilon
        self.order_complexity = order_complexity
        self.sample_size = len(target)

    def _evaluate(self, X, out, *args, **kwargs):
        """Write ``out["F"] = [total MSE, total complexity penalty]`` for one individual."""
        objective = [0., 0.]
        for v, genome in enumerate(X):
            # v selects the target column this genome is regressed against.
            _, mse = self.compute_genome_coefficient(genome, v)
            complexity_penalty = len(genome)
            if self.order_complexity:
                # should we have this additional complexity_penalty
                complexity_penalty += sum(sum(module) for module in genome)
            complexity_penalty *= self.epsilon
            objective[0] += mse
            objective[1] += complexity_penalty
        out["F"] = objective
        
    def generate_ode_module(self):
        """Draw a random module: exponents assigned left to right from a shared
        degree budget of ``n_poly``; remaining entries stay 0 once it is spent."""
        n_poly = self.n_poly
        module = [0 for _ in range(self.n_inds)]
        for i in range(self.n_inds):
            deg = random.randint(0, n_poly)
            module[i] = deg
            n_poly -= deg
            if n_poly <= 0:
                break
        return tuple(module)
        
    def numericalize_genome(self, genome):
        """Stack the feature arrays of the genome's modules along a new last axis."""
        return np.stack([self.base_features[tuple(module)] 
                         for module in genome], axis=-1)

    def compute_genome_coefficient(self, genome, v):
        """Least-squares fit of target column ``v`` on the genome's features.

        Returns
        -------
        coeff : ndarray
            Least-squares coefficients, one row per module.
        mse : float
            Sum of squared residuals divided by the sample size, or ``np.inf``
            when ``lstsq`` reports no residual (it returns an empty array for
            rank-deficient or non-overdetermined systems).
        """
        features = self.numericalize_genome(genome)
        features = features.reshape(-1, features.shape[-1])
        # rcond=None is the documented way to select the machine-precision
        # cutoff and avoids the FutureWarning on older NumPy releases.
        coeff, error, _, _ = np.linalg.lstsq(features, self.target[:, v:v+1],
                                             rcond=None)
        mse = error[0]/self.sample_size if len(error) > 0 else np.inf
        return coeff, mse

    def set_epsilon(self, epsilon):
        """Update the multiplier applied to the complexity objective."""
        self.epsilon = epsilon

class PopulationSampling(Sampling):
    """Initial population: for every target column, ``n_samples`` distinct
    random genomes (frozensets of modules)."""

    def _do(self, problem, n_samples, **kwargs):
        population = np.full((n_samples, problem.n_target), None, dtype=object)
        for target_idx in range(problem.n_target):
            # Genomes already placed in this column; its size doubles as the
            # next free row index.
            seen = set()
            while len(seen) < n_samples:
                size = random.randint(1, problem.n_modules)
                candidate = frozenset(
                    problem.generate_ode_module() for _ in range(size)
                )
                if len(candidate) == 0 or candidate in seen:
                    continue
                population[len(seen), target_idx] = candidate
                seen.add(candidate)
        return population

class DuplicateElimination(ElementwiseDuplicateElimination):
    """Treats two individuals as duplicates when they carry the same genome
    at every position of their decision vectors."""

    def is_equal(self, g1, g2):
        width = g1.X.shape[-1]
        # Vectors of different length can never match.
        if g2.X.shape[-1] != width:
            return False
        # Equal only if no position holds differing genomes.
        return not any(g1.X[pos] != g2.X[pos] for pos in range(width))

In [8]:
class GenomeCrossover(Crossover):
    """Two-parent, two-offspring crossover that pools and redistributes modules."""

    def __init__(self):
        # Two parents in, two offspring out.
        super().__init__(2, 2)

    def _do(self, problem, X, **kwargs):
        # X is indexed as (parent, mating, target).
        _n_parents, n_matings, n_target = X.shape

        # Offspring array has the same shape as X because the number of
        # offspring equals the number of parents.
        offspring = np.full(X.shape, None, dtype=object)

        # Recombine target by target, mating by mating.
        for target_idx in range(n_target):
            for mating_idx in range(n_matings):
                first_parent = X[0, mating_idx, target_idx]
                second_parent = X[1, mating_idx, target_idx]
                child_a, child_b = self.crossover_permutation(first_parent, second_parent)
                offspring[0, mating_idx, target_idx] = child_a
                offspring[1, mating_idx, target_idx] = child_b

        return offspring

    def crossover_permutation(self, genome1, genome2):
        """Shuffle the pooled modules of both parents and cut the pool at
        ``len(genome1)``; each side becomes a frozenset child."""
        pool = [*genome1, *genome2]
        random.shuffle(pool)
        cut = len(genome1)
        return frozenset(pool[:cut]), frozenset(pool[cut:])
    
class GenomeMutation(Mutation):
    """Applies up to three independent edits to each genome: add a module,
    delete a module, and swap one module for a new one."""

    def __init__(self, add_rate=0.4, del_rate=0.5, order_rate=0.4):
        super().__init__()
        self.add_rate = add_rate
        self.del_rate = del_rate
        self.order_rate = order_rate

    def _do(self, problem, X, **kwargs):
        n_rows = len(X)
        for col in range(X.shape[1]):
            for row in range(n_rows):
                # Each edit is gated by its own random draw, in this fixed order.
                if random.random() < self.add_rate:
                    X[row, col] = self.add_mutate(problem, X[row, col])
                if random.random() < self.del_rate:
                    X[row, col] = self.del_mutate(problem, X[row, col])
                if random.random() < self.order_rate:
                    X[row, col] = self.module_mutate(problem, X[row, col])
        return X

    def add_mutate(self, problem, genome, max_iter=3):
        """Try up to ``max_iter`` times to add a module not already present;
        return the genome unchanged if every draw is already contained."""
        for _ in range(max_iter):
            candidate = problem.generate_ode_module()
            if candidate in genome:
                continue
            return genome.union(frozenset({candidate}))
        return genome

    def del_mutate(self, problem, genome, max_iter=3):
        """Drop one random module; a single-module genome instead has its
        module replaced by a different one (up to ``max_iter`` attempts)."""
        modules = list(genome)
        count = len(modules)
        if count == 0:
            return frozenset(modules)
        if count == 1:
            only_module = modules[0]
            for _ in range(max_iter):
                candidate = problem.generate_ode_module()
                if candidate != only_module:
                    return frozenset({candidate})
            # No different module was drawn: keep the genome as it is.
            return frozenset(modules)
        del modules[random.randint(0, count - 1)]
        return frozenset(modules)

    def module_mutate(self, problem, genome, max_iter=3):
        """Remove one random module and try up to ``max_iter`` times to insert
        a module not among the remaining ones; if no draw succeeds the genome
        is returned one module shorter."""
        if len(genome) == 0:
            return genome
        remaining = set(genome)
        removed = random.choice(list(remaining))
        remaining.remove(removed)
        for _ in range(max_iter):
            candidate = problem.generate_ode_module()
            if candidate in remaining:
                continue
            remaining.add(candidate)
            break
        return frozenset(remaining)

In [9]:
pop_size = 1000

# Search problem for the first target column only (kept 2-D via the 0:1 slice),
# with exponent sums included in the complexity objective.
problem = OdeDiscoveryProblem(
    n_poly,
    n_ind,
    n_modules,
    base_features,
    y_pre[:, 0:1],
    order_complexity=True,
)

# Draw an initial population and keep only each individual's decision vector.
pop = PopulationSampling().do(problem, pop_size)
pop = [pop[idx].X for idx in range(len(pop))]
# problem.evaluate(pop)

In [10]:
n_optimal_models = 15

# Fix the random state so the evolutionary run is reproducible. The custom
# sampling / crossover / mutation operators draw from the stdlib `random`
# module, so it is seeded explicitly alongside NumPy; the same seed is also
# handed to pymoo through `minimize`.
ga_seed = 0
random.seed(ga_seed)
np.random.seed(ga_seed)

# Stop on convergence of the design / objective space (checked over a
# 50-generation window) or when the generation / evaluation budget is spent.
termination = DefaultMultiObjectiveTermination(
    xtol=1e-8,
    cvtol=1e-6,
    ftol=1e-8,
    period=50,
    n_max_gen=100,
    n_max_evals=200000
)

# Genomes are frozensets, so sampling, variation and duplicate detection all
# use the custom operators defined above.
algorithm = DNSGA2(pop_size=pop_size,
                   sampling=PopulationSampling(),
                   crossover=GenomeCrossover(),
                   mutation=GenomeMutation(),
                   eliminate_duplicates=DuplicateElimination())

res = minimize(problem,
               algorithm,
               termination=termination,
               seed=ga_seed,
               verbose=True)

n_gen  |  n_eval  | n_nds  |      eps      |   indicator  
     1 |     1000 |      8 |             - |             -
     2 |     3100 |      9 |  0.0588235294 |         ideal
     3 |     5200 |     12 |  0.2608695652 |         nadir
     4 |     7300 |     11 |  0.2068965517 |         nadir
     5 |     9400 |     13 |  0.0212201592 |             f
     6 |    11500 |     15 |  0.2368421053 |         nadir
     7 |    13600 |     17 |  0.0030959752 |             f
     8 |    15700 |     21 |  0.0112781955 |             f
     9 |    17800 |     23 |  0.0057208238 |             f
    10 |    19900 |     26 |  0.0952380952 |         nadir
    11 |    22000 |     25 |  0.0454545455 |         nadir
    12 |    24100 |     24 |  0.0066287879 |             f
    13 |    26200 |     26 |  0.0638297872 |         nadir
    14 |    28300 |     27 |  0.0015760441 |             f
    15 |    30400 |     27 |  0.0070921986 |             f
    16 |    32500 |     27 |  0.0007880221 |            

In [11]:
# Reference equations noted by the author for comparison:
# (x0)' = -9.999 x0 + 9.999 x1
# (x1)' = 27.992 x0 + -0.999 x1 + -1.000 x0 x2
# (x2)' = -2.666 x2 + 1.000 x0 x1

# Weight for the complexity objective: 10 raised to the second item returned
# by sci_format for the median error (presumably its decimal exponent).
epsilon = 10 ** sci_format(np.median(res.F[:, :1]))[1]

# Rank the non-dominated models by error + epsilon * complexity and keep the
# n_optimal_models best ones.
pareto_optimal_models = res.X[
    np.argsort(res.F[:, 0] + epsilon * res.F[:, 1])[:n_optimal_models]
]
best = pareto_optimal_models[0]
best

array([frozenset({(1, 0, 0), (0, 1, 0)})], dtype=object)