In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import scienceplots

import time
import math
import os
import sys
import random
from functools import partial
from decimal import Decimal
import numpy as np
import scipy.io as sio
import pysindy as ps
from tqdm import trange

sys.path.insert(0, '../')
from utils import *
from solvel0 import solvel0, MIOSR
from best_subset import backward_refinement, brute_force_all_subsets
from UBIC import *
from bayesian_model_evidence import log_evidence

from skimage.restoration import estimate_sigma
import bm3d
from kneed import KneeLocator

from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn import covariance
from sklearn.linear_model import lars_path
from abess import LinearRegression as AbessLinearRegression
from knockpy import KnockoffFilter, knockoff_stats, knockoffs
from knockpy.utilities import estimate_covariance
from scipy import stats
from statsmodels.stats.multitest import multipletests
from c2st.check import c2st # https://github.com/psteinb/c2st

from mbic import mbic, mbic2, ebic

from rdata import read_rds
from selective_inference import forward_stop_rule, sfs_si, stepwise_selective_inference, subset_fdr
import fpsample
from dppy.finite_dpps import FiniteDPP

from si4pipeline import (
                        construct_pipelines, 
                        extract_features, 
                        initialize_dataset, 
                        intersection, 
                        lasso, 
                        marginal_screening, 
                        stepwise_feature_selection, 
                        union, 
                        PipelineManager
                        )

alibi is not installed in the environment.


In [2]:
# Read the Burgers dataset from the MATLAB file; the keys used below are `usol`, `x` and `t`.
data_path = "../Datasets/"
data = sio.loadmat(os.path.join(data_path, "burgers.mat"))

# Keep only the real parts. `u_clean` is left untouched; `u` is the working copy
# that later cells overwrite.
u_clean = np.real(data['usol'])
u = u_clean.copy()
x = np.real(data['x'][0])
t = np.real(data['t'][:, 0])

# Both axes bundled in one object-dtype array.
xt = np.array([x, t], dtype=object)

# Grid spacings taken from neighbouring samples of each axis.
dt = t[1] - t[0]
dx = x[2] - x[1]

In [3]:
np.random.seed(0)
noise_type = "gaussian"
noise_lv = float(50)
print("Noise level:", noise_lv)
# Zero-mean Gaussian noise whose standard deviation is `noise_lv` percent of the
# clean field's standard deviation.
# The noise is derived from, and added to, `u_clean` rather than `u`, so re-running
# this cell cannot stack noise on top of an already noisy (or already denoised) `u`.
# On a fresh Run All the result is unchanged, because `u` starts as a copy of `u_clean`.
noise = 0.01*np.abs(noise_lv)*(u_clean.std())*np.random.randn(u_clean.shape[0],u_clean.shape[1])
u = u_clean + noise

Noise level: 50.0


In [4]:
np.random.seed(0)

# Options shared by every BM3D call in this cell.
bm3d_opts = dict(stage_arg=bm3d.BM3DStages.ALL_STAGES, blockmatches=(False, False))

# Choose the BM3D noise level without looking at the clean field:
#   1. add synthetic Gaussian noise (scale = estimated sigma of `u`) on top of `u`;
#   2. build candidate sigmas as multiples (0.1 up to, but excluding, 2.0 in steps of 0.1)
#      of the estimated sigma of that noisier field;
#   3. keep the candidate whose denoised output of the noisier field is closest to `u`
#      in mean squared error.
fake_noise = np.random.normal(loc=0.0, scale=estimate_sigma(u), size=u.shape)
noisier_u = u + fake_noise
sigmas = estimate_sigma(noisier_u)*np.arange(0.1, 2., 0.1)
denoise_errors = [
    ((u - bm3d.bm3d(noisier_u, sigma_psd=sigma, **bm3d_opts))**2).mean()
    for sigma in sigmas
]
est_sigma = sigmas[np.argmin(denoise_errors)]

# Denoise the working field with the selected sigma (this overwrites `u`).
u = bm3d.bm3d(u, sigma_psd=est_sigma, **bm3d_opts)

In [5]:
# Library settings: polynomial degree, derivative order, and the value passed as `K`.
n_poly = 6
n_derivatives = 6
n_weak = 2000

function_library = ps.PolynomialLibrary(degree=n_poly, include_bias=False)

# The (x, t) mesh is stacked and transposed before being handed over as `spatiotemporal_grid`.
weak_lib = ps.WeakPDELibrary(
    function_library=function_library,
    derivative_order=n_derivatives,
    spatiotemporal_grid=np.asarray(np.meshgrid(x, t)).T,
    include_bias=True,
    diff_kwargs={"is_uniform": True},
    K=n_weak,
)

# The library calls below receive `u` with one extra trailing axis.
u_with_channel = np.expand_dims(u, -1)
X_pre = np.array(weak_lib.fit_transform(u_with_channel))
y_pre = weak_lib.convert_u_dot_integral(u_with_channel)
feature_names = np.array(weak_lib.get_feature_names())

In [6]:
### Cache ###
# Optional shortcut, kept commented out: these lines would load previously saved
# `X_pre`, `y_pre` and `feature_names` arrays (plus the R `fsInf` result read by the
# commented-out R cell) in place of the values computed above.
# NOTE(review): the files under ../Cache and ../R/R_data are presumably written by an
# earlier run at the same noise level — confirm before enabling.
# X_pre = np.load("../Cache/X_pre_burgers_noise50.npy")
# y_pre = np.load("../Cache/y_pre_burgers_noise50.npy")
# u_pre = y_pre.copy()
# feature_names = np.load("../Cache/feature_names_burgers.npy")
# fsInf = read_rds("../R/R_data/fsInf_screening_burgers_noise50.rds")

### Selective inference ###

### Python

In [7]:
# Screening budget, the complexity cap for the FDR search, and the FDR levels to try
# (from most to least permissive).
n_terms = 16
max_complexity = 12
alphas = [0.3, 0.25, 0.2, 0.15, 0.1, 0.05, 0.01]

### OPTION-I ###
# _, lars_p, _ = lars_path(StandardScaler().fit_transform(X_pre), y_pre.flatten(), method='lasso', alpha_min=1e-6, max_iter=1000)
# lars_p = np.array(list(map(int, lars_p)))[:n_terms]

### OPTION-II ###
# Screening in three steps: take the non-zero support of an abess fit, refit with MIOSR
# limited to at most that many terms, then order the surviving columns with a LARS-lasso path.
nonzero = np.nonzero(AbessLinearRegression(s_min=1, s_max=n_terms, path_type='gs', fit_intercept=False, alpha=1e-9, max_iter=100).fit(X_pre, y_pre.flatten()).coef_)[0]
nonzero = np.nonzero(MIOSR(X_pre, y_pre, alpha=1e-9, non_zero=min(len(nonzero), n_terms)))[0]
_, lars_p, _ = lars_path(StandardScaler().fit_transform(X_pre[:, nonzero]), y_pre.flatten(), method='lasso', alpha_min=0)
# Map the path's column positions back to indices into X_pre.
lars_p = nonzero[np.array(list(map(int, lars_p)))][:n_terms]

X_test = X_pre[:, lars_p]
# Residual standard deviation of the least-squares fit on the screened columns.
# rcond=None is passed explicitly (as the knee-selection cell already does) so the
# singular-value cutoff does not depend on the installed NumPy's default and no
# FutureWarning is raised on NumPy < 2.0.
sigma = np.std(y_pre-X_test@np.linalg.lstsq(X_test, y_pre, rcond=None)[0], ddof=1)
manager = stepwise_selective_inference(support_size=len(lars_p))
_, p_list = manager.inference(X_test, y_pre, sigma)
print(lars_p, p_list, subset_fdr(p_list))

# Walk the FDR levels in the listed order and stop at the first one whose forward-stop
# selection fits within max_complexity terms.
for alpha in alphas:
    adjusted_pvalues = p_list
    stop_step, false_discovery_rates = forward_stop_rule(adjusted_pvalues, alpha)
    adjusted_pvalues = adjusted_pvalues[:stop_step+1]
    rejections = np.sort(lars_p[:stop_step+1])
    if len(rejections) <= max_complexity:
        break
else:
    # No level satisfied the cap: the selection from the last alpha is kept, exactly as
    # before, but the fall-through is now reported instead of passing silently.
    print(f"Warning: no alpha in {alphas} yields <= {max_complexity} terms; "
          f"keeping the {len(rejections)} terms selected at alpha={alpha}.")
max_fdr = alpha
max_fdr, feature_names[rejections], len(rejections)

Set parameter Username
Academic license - for non-commercial use only - expires 2026-04-04
[13  8 31 10 17 44 25 23 43 36  6 12 48  5 20 21] [0.0, 0.0, 0.0, 0.0, 0.0, 0.06405712341604064, 8.945066909404886e-13, 1.2557997974838031e-09, 0.010776968849477875, 0.0, 0.0, 8.84764189250653e-05, 0.0, 0.0, 1.4925283231548292e-11, 0.0002840211046996055] 0.004838052356998551


(0.01,
 array(['x0^5', 'x0^6', 'x0_11', 'x0_1111', 'x0_111111', 'x0x0_1',
        'x0^5x0_1', 'x0^2x0_11', 'x0^3x0_11', 'x0^5x0_11', 'x0x0_111',
        'x0x0_1111', 'x0^6x0_1111', 'x0x0_111111', 'x0^2x0_111111',
        'x0^6x0_111111'], dtype='<U13'),
 16)

### R

In [8]:
# Alternative to the Python selection above, kept commented out: the same forward-stop
# loop driven by the p-values ("pv") and variable order ("vars") stored in `fsInf`.
# It needs `fsInf`, which is only defined by the commented-out `read_rds` line in the
# cache cell. The `-1` converts the stored variable indices to 0-based positions.
# max_complexity = 12
# alphas = [0.3, 0.2, 0.1, 0.05, 0.01]
# for alpha in alphas:
#     adjusted_pvalues = fsInf.get("pv")
#     stop_step, false_discovery_rates = forward_stop_rule(adjusted_pvalues, alpha)
#     adjusted_pvalues = adjusted_pvalues[:stop_step+1]
#     rejections = np.sort((fsInf.get("vars")-1).astype(np.int32)[:stop_step+1])
#     if len(rejections) <= max_complexity:
#         break
# max_fdr = alpha
# feature_names[rejections]

In [9]:
# Keep only the selected candidate columns, then rescale each column to unit Euclidean norm.
X_pre_top = X_pre[:, rejections]
column_norms = np.linalg.norm(X_pre_top, ord=2, axis=0, keepdims=True)
X_pre_top = X_pre_top / column_norms

In [10]:
# Subset search over the normalised candidate columns, capped at 8 terms (judging by the
# helper's name, an exhaustive search). Only the second return value is kept; later cells
# treat each entry of `best_subsets` as a collection of column indices into `X_pre_top`.
_, best_subsets = brute_force_all_subsets(X_pre_top, y_pre, max_support_size=8)

100%|██████████████████████████████████████████████████████████████████| 8/8 [00:10<00:00,  1.30s/it]


In [11]:
# Pick the model complexity ("knee") from the information-criterion curve over best_subsets.
# Assume that mbics is a decreasing sequence
complexities = np.array([len(subset) for subset in best_subsets])

if len(best_subsets) <= 2:
    # Too few candidates to locate a knee: take the largest available complexity.
    knee = complexities.max()
else:
    ebics = []
    mbics = []
    for subset in best_subsets:
        X_sub = X_pre_top[:, subset]
        # Log-likelihood value of the least-squares fit restricted to this subset.
        loglik = log_like_value(X_sub@np.linalg.lstsq(X_sub, y_pre, rcond=None)[0],
                                y_pre)
        ebics.append(ebic(loglik, len(subset), len(y_pre), X_pre_top.shape[-1], const=0))
        mbics.append(mbic(loglik, len(subset), len(y_pre), X_pre_top.shape[-1], const=2))
    ebics = np.array(ebics)
    mbics = np.array(mbics)

    # Straight line through the maximum and minimum of the mBIC curve, evaluated at every
    # position. It is computed once here (previously it was rebuilt for each comparison),
    # with the arithmetic order left unchanged.
    chord = np.array([max(mbics)+i*(min(mbics)-max(mbics))/(np.argmin(mbics)-np.argmax(mbics)) for i in range(len(best_subsets))])

    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    if np.all(mbics >= chord):
        # The curve never dips below the chord: fall back to the largest complexity.
        knee = complexities.max()
    else:
        # Search for the knee only among the points lying on or below the chord.
        decreasing_indices = mbics <= chord
        knee = knee_finder(mbics[decreasing_indices])
        knee = (complexities[decreasing_indices])[knee]

knee

2

In [12]:
# Reset both RNG streams so the subsampling below is repeatable.
np.random.seed(0)
random.seed(0)

# Rows per repeat: 250 per unit of `knee`, never more than the rows available.
n_samples = min(int(250*knee), len(y_pre))
false_discovery_control_method = 'bh'
print("max fdr:", max_fdr)

fdr_data = []
for bs in best_subsets:
    fdrs = []
    for rep in range(len(y_pre)//n_samples):
        # Every repeat starts again from the full rows restricted to this subset's columns.
        X_test = X_pre_top[:, bs]
        y_test = y_pre.ravel()

        # NumPy is re-seeded per repeat with a value drawn from the seeded `random` stream.
        np.random.seed(random.randint(0, 100))
        # sample_indices = sorted(set([np.random.randint(len(y_pre)) for _ in range(n_samples)]))
        # Stage 1: Farthest Point Sampling (FPS) picks n_samples rows.
        sample_indices = fpsample.bucket_fps_kdline_sampling(X_test, n_samples=n_samples, h=3)
        X_test = X_test[sample_indices]
        y_test = y_test[sample_indices]

        # Stage 2: repeated exact k-DPP draws (k = subset size) on the Gram kernel of the
        # FPS rows; the union of all drawn row indices is kept.
        DPP = FiniteDPP('likelihood', L=X_test.dot(X_test.T))
        DPP.flush_samples()
        for draw in range(n_samples//len(bs)):
            DPP.sample_exact_k_dpp(size=len(bs))
        sample_indices = np.unique(np.ravel(DPP.list_of_samples))
        X_test = X_test[sample_indices]
        y_test = y_test[sample_indices]

        # Selective inference on the thinned sample, using std(y_test) as the noise scale.
        manager = stepwise_selective_inference(support_size=X_test.shape[1])
        M, p_list = manager.inference(X_test, y_test, np.std(y_test))
        if false_discovery_control_method is not None:
            p_list = stats.false_discovery_control(p_list, method=false_discovery_control_method)
        # print(M, p_list, np.array(p_list) < 0.05)
        fdrs.append(subset_fdr(p_list))

    fdrs = np.array(fdrs)
    mean_fdr = fdrs.mean()
    # Only subsets whose mean FDR is below 1 are reported and kept for clustering.
    if mean_fdr < 1:
        print(len(bs), mean_fdr, stats.wilcoxon(fdrs-max_fdr, alternative='less').pvalue)
        fdr_data.append(fdrs)

fdr_data = np.array(fdr_data)

max fdr: 0.01
1 0.0 0.0625
2 5.963071703582124e-11 0.0625
3 0.04096051873249207 1.0
4 0.0915303306141322 1.0
5 0.3244726513182708 1.0
6 0.4194391611415963 1.0
7 0.3874262721222558 1.0
8 0.519674554645266 1.0


In [13]:
from sklearn.cluster import AffinityPropagation, KMeans
# Cluster the per-subset FDR vectors (one row of `fdr_data` per subset size).
# Both estimators draw random numbers. Without an explicit random_state they consume
# whatever global NumPy RNG state the previous cell left behind, so the labels could
# change when this cell is re-run on its own. n_init is pinned as well because
# scikit-learn's default for it differs between releases.
print(AffinityPropagation(random_state=0).fit(fdr_data).labels_)
print(KMeans(n_clusters=2, n_init=10, random_state=0).fit(fdr_data).labels_)
# plt.plot([1, 2, 3, 4], fdr_data.mean(axis=-1), 'o'); plt.show()

[0 0 0 0 1 1 1 1]
[0 0 0 0 1 1 1 1]


In [14]:
# Same FDR estimate as the preceding FDR cell, but with Farthest Point Sampling only:
# the k-DPP thinning stage is intentionally skipped here and the per-subset FDR vectors
# are not collected.
np.random.seed(0); random.seed(0)
# Bounded by len(y_pre), the same quantity that sets the repeat count below and the one
# the preceding FDR cell uses (this was len(X_pre)).
n_samples = min(int(250*knee), len(y_pre))
false_discovery_control_method = 'bh'
print("max fdr:", max_fdr)
for bs in best_subsets:
    fdrs = []
    for _ in range(len(y_pre)//n_samples):
        # Every repeat starts again from the full rows restricted to this subset's columns.
        X_test = X_pre_top[:, bs]
        y_test = y_pre.ravel()

        # NumPy is re-seeded per repeat with a value drawn from the seeded `random` stream.
        np.random.seed(random.randint(0, 100))
        # sample_indices = sorted(set([np.random.randint(len(y_pre)) for _ in range(n_samples)]))
        sample_indices = fpsample.bucket_fps_kdline_sampling(X_test, n_samples=n_samples, h=3) # Farthest Point Sampling (FPS) is better!!!
        X_test = X_test[sample_indices]; y_test = y_test[sample_indices]

        # Selective inference on the FPS sample, using std(y_test) as the noise scale.
        manager = stepwise_selective_inference(support_size=X_test.shape[1])
        M, p_list = manager.inference(X_test, y_test, np.std(y_test))
        if false_discovery_control_method is not None:
            p_list = stats.false_discovery_control(p_list, method=false_discovery_control_method)
        # print(M, p_list, np.array(p_list) < 0.05)
        fdrs.append(subset_fdr(p_list))

    fdrs = np.array(fdrs)
    # NOTE(review): in the recorded run the smallest p-value printed is 0.0625 = 1/2**4,
    # which looks like the floor of a one-sided exact Wilcoxon test on four repeats.
    # If so, this test cannot reach 0.05 with the current n_samples — confirm.
    print(fdrs.mean(), stats.wilcoxon(fdrs-max_fdr, alternative='less').pvalue)

max fdr: 0.01
0.0 0.0625
0.0 0.0625
0.00706043037469649 0.0625
0.02662044933457 1.0
0.17572923806322494 1.0
0.18509861285699977 1.0
0.17351220136329723 1.0
0.21174723884084515 1.0
