In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import scienceplots

import math
import os
import random
from functools import partial
from decimal import Decimal
import numpy as np
import scipy.io as sio
import pysindy as ps
from tqdm import trange

from pymoo_ga import *
# NSGA2, DNSGA2, SMSEMOA, AGEMOEA2
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.algorithms.moo.dnsga2 import DNSGA2
from pymoo.algorithms.moo.sms import SMSEMOA
from pymoo.algorithms.moo.age2 import AGEMOEA2
from pymoo.termination.default import DefaultMultiObjectiveTermination
from pymoo.optimize import minimize

from utils import *
from skimage.restoration import estimate_sigma
import bm3d
# from okridge.solvel0 import *
from solvel0 import solvel0, MIOSR
from best_subset import backward_refinement, brute_force, brute_force_all_subsets
from UBIC import *
from kneed import KneeLocator
from bayesian_model_evidence import log_evidence

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.preprocessing import StandardScaler
from sklearn import covariance
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.linear_model import BayesianRidge, ARDRegression, lars_path
from bayesian_linear_regression import BayesianLinearRegression

mrmr is not installed in the env you are using. This may cause an error in future if you try to use the (missing) lib.
L0BnB is not installed.


In [2]:
n_poly = 6
n_derivatives = 6
n_modules = 8

In [3]:
data_path = "../PDE-Discovery-EC/Datasets/"
print(os.listdir(data_path))
data = sio.loadmat(os.path.join(data_path, "KdV_rudy.mat"))
u_clean = (data['usol']).real; u = u_clean.copy()
x = data['x'].ravel()
t = data['t'].ravel()
dt = t[1]-t[0]; dx = x[2]-x[1]

['KdV_sine_rep_big.mat', 'kuramoto_sivishinky.mat', 'lorenz100.npy', 'Wave_equation', 'KdV_rudy.mat', 'lorenz10.npy', 'KG_Exp.mat', 'burgers.mat']


### Add noise

In [4]:
np.random.seed(0)
noise_type = "gaussian"
noise_lv = float(50)
print("Noise level:", noise_lv)
noise = 0.01*np.abs(noise_lv)*(u.std())*np.random.randn(u.shape[0],u.shape[1])
u = u + noise

Noise level: 50.0


### Denoise

In [5]:
# load_denoised_data = True
# if load_denoised_data:
#     print("Loading denoised data...")
#     u = np.load(f"./Denoised_data/kdv_{noise_type}{int(noise_lv)}_bm3d.npy")
# else:
#     print("denoising...")
#     n_sampled_t = 30
    
#     kernel = RBF(length_scale=1, length_scale_bounds=(1e-2, 1e3)) + \
#             WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e10))
    
#     xx = colvec(x)
#     u_std = np.ones((u.shape[0], len(t)))
#     for i in trange(len(t)):    
#         gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.0, 
#                                        n_restarts_optimizer=10 # 20
#                                       )
    
#         gpr.fit(xx, u[:, i])
#         _, ustd = gpr.predict(xx, return_std=True)
#         u_std[:, i] = ustd

#     est_sigma = u_std.mean() # max also works well
#     # est_sigma = (est_sigma+estimate_sigma(u))/2
#     u = bm3d.bm3d(u, sigma_psd=est_sigma, 
#                   stage_arg=bm3d.BM3DStages.ALL_STAGES, 
#                   blockmatches=(False, False))

#     np.save(f"./Denoised_data/kdv_{noise_type}{int(noise_lv)}_bm3d.npy", u)

np.random.seed(0)
fake_noise = np.random.normal(loc=0.0, scale=estimate_sigma(u), size=u.shape)
sigmas = estimate_sigma(u+fake_noise)*np.arange(0.1, 2., 0.1)
est_sigma = sigmas[np.argmin([((u-bm3d.bm3d(u+fake_noise, sigma_psd=sigma, stage_arg=bm3d.BM3DStages.ALL_STAGES, blockmatches=(False, False)))**2).mean() \
                              for sigma in sigmas])]
u = bm3d.bm3d(u, sigma_psd=est_sigma, 
                  stage_arg=bm3d.BM3DStages.ALL_STAGES, 
                  blockmatches=(False, False))

In [6]:
xt = np.array([x.reshape(-1, 1), t.reshape(1, -1)], dtype=object)
X, T = np.meshgrid(x, t)
XT = np.asarray([X, T]).T

In [7]:
function_library = ps.PolynomialLibrary(degree=n_poly, include_bias=False)

weak_lib = ps.WeakPDELibrary(
    function_library=function_library,
    derivative_order=n_derivatives,
    p=n_derivatives,
    spatiotemporal_grid=XT,
    include_bias=True,
    K=10000, # 2000, 5000, 10000
    diff_kwargs={"is_uniform":True}
)

X_pre = np.array(weak_lib.fit_transform(np.expand_dims(u, -1)))
y_pre = weak_lib.convert_u_dot_integral(np.expand_dims(u, -1))
feature_names = np.array(weak_lib.get_feature_names())

# R_path = "./Cache/"
# np.save(os.path.join(R_path, f"X_pre_kdv_noise{int(noise_lv)}.npy"), X_pre)
# np.save(os.path.join(R_path, f"y_pre_kdv_noise{int(noise_lv)}.npy"), y_pre)
# np.save(os.path.join(R_path, f"feature_names_kdv.npy"), feature_names)

In [8]:
base_poly = np.array([[p, 0] for p in range(1, n_poly+1)])
base_derivative = np.array([[0, d] for d in range(1, n_derivatives+1)])
modules = [(0, 0)] if weak_lib.include_bias else []
modules += [(p, 0) for p in range(1, n_poly+1)] + \
            [(0, d) for d in range(1, n_derivatives+1)] + \
            [tuple(p+d) for d in base_derivative for p in base_poly]
assert len(modules) == len(weak_lib.get_feature_names())
base_features = dict(zip(modules, X_pre.T))
u_t = y_pre.copy()

### Genetic algorithm with NSGA-II

In [9]:
pop_size = 500
problem = PdeDiscoveryProblem(n_poly, n_derivatives, n_modules, 
                              base_features, u_t, order_complexity=False, ridge_lambda=1e-6)

In [10]:
load_pareto_front = True

if not load_pareto_front:
    termination = DefaultMultiObjectiveTermination(
        xtol=1e-8,
        cvtol=1e-6,
        ftol=1e-8,
        period=50,
        n_max_gen=100,
        n_max_evals=100000
    )

    from pymoo.algorithms.moo.sms import SMSEMOA

    # algorithm = NSGA2(
    #                 pop_size=pop_size, 
    #                 sampling=PopulationSampling(), 
    #                 crossover=GenomeCrossover(), 
    #                 mutation=GenomeMutation(), 
    #                 eliminate_duplicates=DuplicateElimination()
    #                 )

    # algorithm = DNSGA2(
    #                 pop_size=pop_size,
    #                 sampling=PopulationSampling(),
    #                 crossover=GenomeCrossover(),
    #                 mutation=GenomeMutation(),
    #                 eliminate_duplicates=DuplicateElimination()
    #                 )

    algorithm = SMSEMOA(
                    pop_size=pop_size,
                    sampling=PopulationSampling(),
                    crossover=GenomeCrossover(),
                    mutation=GenomeMutation(),
                    eliminate_duplicates=DuplicateElimination()
                    )

    res = minimize(problem, 
                   algorithm, 
                   termination=termination, 
                   verbose=True
                  )
    
    pareto_optimal_models = res.X
    np.save(f"./Cache/pf_SMSEMOA_kdv_noise{int(noise_lv)}.npy", pareto_optimal_models)

else:
    pareto_optimal_models = np.load(f"./Cache/pf_SMSEMOA_kdv_noise{int(noise_lv)}.npy", allow_pickle=True)

In [11]:
### OPTIONAL ###
from operator import itemgetter

effective_candidates = extract_unique_candidates(pareto_optimal_models)

new_pareto_optimal_models = []
for bs in backward_refinement([sorted([effective_candidates.index(_) for _ in list(pm[0])]) for pm in pareto_optimal_models], 
                              (problem.numericalize_genome(effective_candidates), y_pre)).get_best_subsets():
    bs = itemgetter(*bs)(effective_candidates)
    if type(bs[0]) is not tuple:
        bs = (bs,)
    new_pareto_optimal_models.append([frozenset(bs)])
pareto_optimal_models = np.array(new_pareto_optimal_models)
del new_pareto_optimal_models
pareto_optimal_models

array([[frozenset({(0, 1)})],
       [frozenset({(1, 1), (0, 3)})],
       [frozenset({(0, 1), (1, 1), (0, 3)})],
       [frozenset({(1, 1), (0, 3), (1, 3), (0, 5)})],
       [frozenset({(0, 1), (2, 1), (0, 3), (5, 1), (0, 5)})],
       [frozenset({(0, 1), (2, 1), (0, 3), (5, 1), (0, 5), (1, 3)})],
       [frozenset({(0, 1), (2, 1), (0, 3), (5, 1), (2, 3), (0, 5), (1, 3)})],
       [frozenset({(0, 1), (2, 1), (0, 3), (5, 1), (2, 3), (3, 3), (0, 5), (1, 3)})],
       [frozenset({(0, 1), (2, 1), (0, 3), (5, 1), (2, 3), (3, 3), (0, 5), (6, 3), (1, 3)})],
       [frozenset({(0, 1), (2, 1), (0, 3), (5, 1), (2, 3), (3, 3), (0, 5), (6, 6), (6, 3), (1, 3)})],
       [frozenset({(0, 1), (4, 0), (2, 1), (0, 3), (5, 1), (2, 3), (3, 3), (0, 5), (6, 6), (6, 3), (1, 3)})],
       [frozenset({(0, 1), (4, 0), (2, 1), (6, 5), (0, 3), (5, 1), (2, 3), (3, 3), (0, 5), (6, 6), (6, 3), (1, 3)})],
       [frozenset({(0, 1), (4, 0), (2, 1), (6, 5), (0, 3), (5, 1), (2, 3), (4, 5), (3, 3), (0, 5), (6, 6), (6, 3

### Top candidates by SHAP or Lasso/Lars path

In [12]:
# feature_importance = dict(zip(effective_candidates, [0.0 for _ in range(len(effective_candidates))]))

# for bs in pareto_optimal_models[1:]:
#     bs = list(bs[0])
#     shap_importance = shap_linear_importance(problem.numericalize_genome(bs), y_pre, scale=False)
#     for i, _ in enumerate(bs):
#         feature_importance[_] += shap_importance[i]

# top_candidates = sorted([(v, k) for k, v in feature_importance.items()], reverse=True)
# top_candidates = [v for k, v in top_candidates[:16]]

_, lars_p, _ = lars_path(StandardScaler().fit_transform(problem.numericalize_genome(effective_candidates)), 
                         y_pre.ravel(), method='lasso', alpha_min=1e-5)
top_candidates = np.array(effective_candidates)[lars_p].tolist()

top_candidates

[[0, 1],
 [0, 3],
 [1, 1],
 [0, 4],
 [6, 6],
 [0, 5],
 [4, 0],
 [1, 3],
 [2, 1],
 [5, 1],
 [6, 5],
 [2, 3],
 [6, 3],
 [4, 2]]

### Best-subset selections (Optional)

In [13]:
# X_pre_top = problem.numericalize_genome(top_candidates)
# X_pre_top, X_pre_top_norm = normalize_lp(X_pre_top, p=2, axis=0)

# best_subsets = solvel0(X_pre_top, y_pre, miosr=True, refine=True)
# pareto_optimal_models = [[np.array(top_candidates)[list(bs)]] for bs in best_subsets]

# _, _, pde_uncertainties = baye_uncertainties(best_subsets, (X_pre_top, y_pre), 
#                                              u_type='cv1', take_sqrt=True, 
#                                              ridge_lambda=0, 
#                                              threshold=0)

# best_subsets, pde_uncertainties

### Uncertainty quantification

In [14]:
numericalize_genome = False
F = {}

# for each number of active terms -> keep the best coef (in terms of ssr) and track its uncertainty...
for bs in pareto_optimal_models:
    numerical_genome = problem.numericalize_genome(bs[0])
    if numericalize_genome:
        numerical_genome = normalize_lp(numerical_genome)[0]
    
    # um = BayesianLinearRegression() # seems to work well with numericalize_genome = True
    um = ARDRegression(fit_intercept=False, compute_score=True, max_iter=1000)
    um.fit(numerical_genome, y_pre.ravel())
    
    # number of effective parameters
    um_n_params = np.count_nonzero(um.coef_)
    # SSR
    ssr = np.sum((um.predict(numerical_genome) - y_pre.ravel())**2)
    # IC
    bic = BIC_AIC(um.predict(numerical_genome), y_pre.ravel(), um_n_params)[0]
    # PDE uncertainty
    pde_uncertainty = np.linalg.norm(np.sqrt(np.diag(um.sigma_)), 1)/np.linalg.norm(um.coef_, 1)

    pde_stat = (ssr, pde_uncertainty)
    if um_n_params not in F or F[um_n_params] < pde_stat:
        F[um_n_params] = pde_stat

del numerical_genome
assert len(pareto_optimal_models) > 2

F = np.column_stack((list(F.keys()), list(F.values())))
F

array([[1.00000000e+00, 1.50358996e+00, 2.29351010e-03],
       [2.00000000e+00, 4.91956095e-01, 2.25109788e-03],
       [3.00000000e+00, 4.28116439e-01, 7.21473237e-03],
       [4.00000000e+00, 3.53383350e-01, 5.90528193e-03],
       [5.00000000e+00, 3.47036961e-01, 1.46589398e-02],
       [6.00000000e+00, 3.30501413e-01, 1.85889972e-02],
       [7.00000000e+00, 3.20173507e-01, 1.84666373e-02],
       [8.00000000e+00, 3.12971435e-01, 2.14075350e-02],
       [9.00000000e+00, 3.11206631e-01, 3.62476325e-02],
       [1.10000000e+01, 3.09663329e-01, 3.82290058e-02],
       [1.20000000e+01, 3.09643271e-01, 3.83328409e-02],
       [1.40000000e+01, 3.08433342e-01, 1.15373287e-01]])

### MCDM/MCDA ###

In [15]:
from collections import Counter
from pymcdm import weights as obj_w
from compromise_programming import mcdm
from bayesian_model_evidence import log_evidence

include_uncertainty = False
use_information_criterion = False

# Pseudocode: ใช้ F ได้เลยไม่ต้อง nF
nF = F.copy()
nF[:, -1] = nF[:, -1] / nF[:, -1].min()
if use_information_criterion:
    nF[:, -2] = nF[:, -2] - nF[:, -2].min()
if not include_uncertainty:
    nF = nF[:, :-1]

types = np.array([-1 for _ in range(nF.shape[-1])])
# mcdm weights
obj_weights = obj_w.gini_weights(nF, types=types)
print("Weights:", obj_weights)
# recursive mcdm
filtered_F = nF.copy()
while len(filtered_F) > 2:
    ranks, prefs = mcdm(filtered_F, obj_weights, types)
    most_common = Counter(np.argmin(ranks, axis=1)).most_common()
    most_common = sorted(most_common, key=lambda _: (_[1], _[0]), reverse=True)
    print(filtered_F, most_common)

    # keep_until = max(most_common, key=lambda _: _[0])[0]
    keep_until = most_common[0][0]
    filtered_F = filtered_F[:keep_until+1]
    if len(most_common) == 1:
        break

Weights: [0.56903438 0.43096562]
[[ 1.          1.50358996]
 [ 2.          0.49195609]
 [ 3.          0.42811644]
 [ 4.          0.35338335]
 [ 5.          0.34703696]
 [ 6.          0.33050141]
 [ 7.          0.32017351]
 [ 8.          0.31297144]
 [ 9.          0.31120663]
 [11.          0.30966333]
 [12.          0.30964327]
 [14.          0.30843334]] [(1, 4)]


### Intercept or NO Intercept? ###

In [16]:
# true_indices = [9, 13]
# true_coefficients = [-1, -6]
# true_ols = sm.OLS(y_pre, X_pre[:, true_indices]).fit()
# estimated_coefficients = true_ols.params
# print(estimated_coefficients, mean_absolute_percentage_error(true_coefficients, estimated_coefficients))
# true_ols.bic, sm.OLS(y_pre, X_pre[:, [0] + true_indices]).fit().bic