In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import scienceplots

import time
import math
import os
import random
from functools import partial
from decimal import Decimal
from collections import Counter
import numpy as np
import scipy.io as sio
import pysindy as ps
from tqdm import trange

from utils import *
from skimage.restoration import estimate_sigma
import bm3d
from solvel0 import solvel0
from best_subset import backward_refinement, brute_force_all_subsets
from UBIC import *
from kneed import KneeLocator
from bayesian_model_evidence import log_evidence

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

In [2]:
# Cached design matrix and targets, cast to float32.
# (File names suggest a weak-form "RD" dataset -- TODO confirm provenance.)
X_pre = np.load("./Cache/X_weak_RD_2025.npy").astype(np.float32)
y_pre = np.load("./Cache/y_weak_RD_2025.npy").astype(np.float32)

# Ground truth, written as {column index of X_pre: coefficient} so every
# index sits next to the coefficient it belongs to (dicts keep insertion order).
truth_u = {0: 1.0, 5: -1.0, 6: 1.0, 7: -1.0, 8: 1.0, 11: 0.1, 17: 0.1}
truth_v = {1: 1.0, 5: -1.0, 6: -1.0, 7: -1.0, 8: -1.0, 12: 0.1, 18: 0.1}

ground_indices_u = np.array(list(truth_u.keys()))
ground_coeff_u = np.array(list(truth_u.values()))
ground_indices_v = np.array(list(truth_v.keys()))
ground_coeff_v = np.array(list(truth_v.values()))

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn import covariance
from abess import LinearRegression as AbessLinearRegression
from knockpy import KnockoffFilter, knockoff_stats, knockoffs
from scipy import stats
from statsmodels.stats.multitest import multipletests
### extra ###
from c2st.check import c2st # https://github.com/psteinb/c2st

# Standardize the candidate columns; X_pre_top is the working copy that is
# pruned further down the notebook.
X_scale = StandardScaler().fit_transform(X_pre)
X_pre_top = X_scale.copy()

# z-score the targets using the mean/std of the whole y_pre array, then keep
# only the first column as an (n, 1) array.
# NOTE(review): the statistics are taken before the column is selected, so if
# y_pre holds more than one column the kept column is not exactly unit-variance
# -- confirm this is intended.
y_pre = (y_pre - y_pre.mean()) / y_pre.std()
y_pre = y_pre[:, 0:1]

# Feature statistic for the knockoff filter, built on an abess regressor.
# Alternatives that were tried and left disabled:
#   lr = SCO(path_type='gs', sparsity=10, ic_method='LinearSIC')
#   fstat = knockoff_stats.ShapStatistic(model=lr)
#   kfilter = KnockoffFilter(ksampler='gaussian', fstat='lasso', knockoff_kwargs={'method':'ci'})
lr = AbessLinearRegression(path_type='gs', s_max=12, fit_intercept=False, cv=5, screening_size=0)
fstat = knockoff_stats.Eli5PIStatistic(model=lr, n_iter=10)
kfilter = KnockoffFilter(ksampler='gaussian', fstat=fstat, knockoff_kwargs={'method':'ci'})

# Run the filter 50 times under a fixed NumPy seed and collect every
# non-empty selection as a set of column indices.
fdr = 0.2
rejections = []
np.random.seed(1234)
for _ in trange(50):
    selected_mask = kfilter.forward(X=X_pre_top, y=y_pre.flatten(), fdr=fdr,
                                    shrinkage="ledoitwolf", recycle_up_to=0.5)
    rejection = set(np.nonzero(selected_mask == 1)[0])
    if rejection:
        rejections.append(rejection)

alibi is not installed in the environment.


100%|████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:35<00:00,  1.41it/s]


In [4]:
# Collapse the per-run selections into a single index set. biggest_superset is
# a helper defined outside this cell; call it once and reuse the result instead
# of recomputing it for the check and for the assignment.
selected_superset = biggest_superset(rejections)

# Sanity check against the known answer: every ground-truth term for u must
# have survived the knockoff stage. On failure, report which ones are missing.
assert set(ground_indices_u).issubset(selected_superset), (
    "knockoff selection is missing ground-truth indices: "
    f"{sorted(set(ground_indices_u).difference(selected_superset))}"
)

# From here on `rejections` is a sorted index array into the standardized
# library, and X_pre_top holds only those columns.
rejections = np.array(sorted(selected_superset))
X_pre_top = X_pre_top[:, rejections]

In [5]:
# Iteratively prune the knockoff-selected columns.
# Each pass:
#   1. re-selects / re-orders the candidates with shap_model_selection,
#   2. for every candidate j, swaps column j with a Gaussian knockoff copy
#      n_swaps times and measures, with a classifier two-sample test (c2st),
#      how distinguishable the original SHAP values of column j are from the
#      SHAP values obtained after the swap,
#   3. keeps j when a one-sided Wilcoxon test finds the c2st scores
#      significantly above c2st_margin,
#   4. additionally drops every kept candidate that lies beyond the prefix
#      length with the smallest OLS BIC.
# The loop stops after the first pass in which every candidate passed step 3.
alpha = 0.01          # significance level of the Wilcoxon test
c2st_margin = 0.51    # c2st score level the candidates are tested against
n_swaps = 50          # knockoff draws per candidate
while True:
    non_null_indices, shap_values = shap_model_selection(X_pre_top, y_pre)
    rejections = rejections[non_null_indices]
    X_pre_top = X_pre_top[:, non_null_indices]
    print(abs(shap_values).mean(axis=0))

    # X_pre_top does not change inside the candidate loop, so the Ledoit-Wolf
    # covariance is estimated once per pass rather than once per knockoff draw.
    # The guard keeps the zero-candidate case from fitting an empty matrix.
    if len(rejections) > 0:
        knockoff_Sigma = covariance.LedoitWolf().fit(X_pre_top).covariance_

    min_bic = np.inf
    non_null_indices = []
    for j in range(len(rejections)):
        classifier_confidences = []
        for _ in trange(n_swaps):
            # A fresh sampler per draw, as before; it receives a copy so the
            # shared covariance estimate can never be altered by the sampler.
            Xk = knockoffs.GaussianSampler(X_pre_top, Sigma=knockoff_Sigma.copy(),
                                           method='ci').sample_knockoffs()
            Xn = X_pre_top.copy()
            Xn[:, j] = Xk[:, j]

            swap_model = linear_model.LinearRegression(fit_intercept=False).fit(Xn, y_pre)
            swap_explainer = shap.explainers.Linear(swap_model, Xn)
            swap_shap_values = swap_explainer(Xn).values

            classifier_confidences.append(
                c2st(shap_values[:, j:j+1], swap_shap_values[:, j:j+1],
                     clf=linear_model.LogisticRegression())
            )

        classifier_confidences = np.array(classifier_confidences)
        pv = stats.wilcoxon(classifier_confidences-c2st_margin, alternative='greater').pvalue

        # The decision is the significance test alone. (The former
        # mean-accuracy rule was assigned and immediately overwritten, so it
        # never had any effect and has been removed.)
        decision = pv < alpha
        print("binary classifier's acc:", classifier_confidences.mean())
        print("P-value:", pv)

        # BIC of an OLS fit on the first j+1 columns in their current order.
        bic = sm.OLS(y_pre, X_pre_top[:, :j+1]).fit().bic
        if decision:
            print("Non-null", rejections[j])
            non_null_indices.append(j)
        if bic < min_bic:
            min_bic = bic
            min_j = j

    # Decide on termination before non_null_indices is filtered below: stop
    # once no candidate was rejected by the significance test in this pass.
    all_candidates_kept = len(non_null_indices) >= len(rejections)

    # Both the continuing and the terminating pass apply the same BIC cut-off.
    non_null_indices = [idx for idx in non_null_indices if idx <= min_j]
    rejections = rejections[non_null_indices]
    X_pre_top = X_pre_top[:, non_null_indices]
    print(rejections)
    if all_candidates_kept:
        break

[0.9723034  0.6119583  0.5902573  0.26487362 0.25998566 0.05612521
 0.05611704]


100%|████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:13<00:00,  3.69it/s]


binary classifier's acc: 0.5361060146979669
P-value: 8.881784197001252e-16
Non-null 0


100%|████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:13<00:00,  3.77it/s]


binary classifier's acc: 0.5311574532010276
P-value: 8.881784197001252e-16
Non-null 5


100%|████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:13<00:00,  3.82it/s]


binary classifier's acc: 0.5140616713763491
P-value: 4.5203196957999126e-05
Non-null 8


100%|████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:13<00:00,  3.75it/s]


binary classifier's acc: 0.5190485347462873
P-value: 8.881784197001252e-16
Non-null 7


100%|████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:13<00:00,  3.68it/s]


binary classifier's acc: 0.5187162087519396
P-value: 0.0011381125549254634
Non-null 6


100%|████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:13<00:00,  3.82it/s]


binary classifier's acc: 0.5209508734755968
P-value: 2.4802382370125997e-11
Non-null 17


100%|████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:13<00:00,  3.77it/s]

binary classifier's acc: 0.5242539271782064
P-value: 8.881784197001252e-16
Non-null 11
[ 0  5  8  7  6 17 11]





In [6]:
from mbic import mbic, mbic2, ebic
# solvel0 returns a sequence of supports; only its last entry is used here.
l0_supports = solvel0(X_pre_top, y_pre, miosr=True, refine=True)
largest_support = l0_supports[-1]

# Enumerate subsets restricted to the columns of that support. The second item
# returned by brute_force_all_subsets indexes into the restricted matrix, so
# each subset is mapped back to column indices of X_pre_top.
local_subsets = brute_force_all_subsets(X_pre_top[:, largest_support], y_pre)[1]
best_subsets = [tuple(largest_support[pos] for pos in local_subset)
                for local_subset in local_subsets]

  0%|                                                                                                  | 0/7 [00:00<?, ?it/s]

Set parameter Username
Academic license - for non-commercial use only - expires 2026-04-04


100%|██████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 39.43it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 94.91it/s]


In [7]:
# TODO: Calculate post_means for ARDRegression as well (Implement the ard_uncertainties function)
# For each candidate subset, fit an ARD regression and record a relative
# uncertainty: sum of the square roots of diag(sigma_) over the sum of
# |coef_|. The values are finally rescaled so the smallest equals 1.
threshold_lambda = 5e5 # must pass assert 
ard_uns = []
for bs in best_subsets:
    ard = linear_model.ARDRegression(threshold_lambda=threshold_lambda,
                                     compute_score=True,
                                     fit_intercept=False)
    ard.fit(X_pre_top[:, bs], y_pre.ravel())
    print(len(bs), end=', ')

    # Every term of the subset must keep a non-zero ARD coefficient.
    n_active = np.count_nonzero(ard.coef_)
    assert len(bs) == n_active

    coef_spread = np.sqrt(np.diag(ard.sigma_)).sum()
    coef_magnitude = abs(ard.coef_).sum()
    pde_uncert = coef_spread/coef_magnitude
    ard_uns.append(pde_uncert)

ard_uns = np.array(ard_uns)
smallest_uncert = min(ard_uns)
ard_uns = ard_uns/smallest_uncert
ard_uns

1, 2, 3, 4, 5, 6, 7, 

array([1.9008448, 1.       , 1.27763  , 1.755511 , 1.4406531, 2.698522 ,
       1.4739254], dtype=float32)

In [8]:
# Select one of the candidate subsets by sweeping the UBIC hyper-parameter
# 10**lam from an upper bound down towards 10**0 and tracking which candidate
# minimizes UBIC at each step; a knee-based choice is made at the end.
# UBIC, knee, knee_finder, sci_format, baye_uncertainties, log_like_value and
# decreasing_values_indices are helpers defined outside this cell.

# tau: the starting exponent last_lam is split into tau equal steps (delta below).
tau = 3
verbose = True
# scale = 1 <- generalized UBIC
# scale is forwarded to UBIC(...) and also divides the lower bounds when the
# starting exponent is derived.
scale = np.log(len(y_pre))
# Percentile of |slopes| from which the improvement threshold is derived.
per = 75 # 80

# One column of post_means, one entry of b_bics and one entry of b_uns per
# candidate subset (presumably posterior means, BIC values and uncertainties
# -- confirm against baye_uncertainties).
post_means, b_bics, b_uns = baye_uncertainties(best_subsets, (X_pre_top, y_pre), 
                                               u_type='cv1', take_sqrt=True, 
                                               ridge_lambda=0, 
                                               threshold=0)
# b_uns = ard_uns # USE ard_uns INSTEAD
# Column k of predictions is the fit of candidate k.
predictions = X_pre_top@post_means
print(b_bics)
print(b_uns)
b_bics = np.array(b_bics)
max_complexity = len(b_bics)
# Complexity of candidate k is taken to be k+1.
complexities = np.arange(max_complexity)+1
# Restrict to the positions returned by decreasing_values_indices (by its name,
# the candidates along which the BIC keeps decreasing -- confirm).
d_complexities = complexities[decreasing_values_indices(b_bics)]
d_bics = b_bics[decreasing_values_indices(b_bics)]
# Relative BIC change per unit of added complexity between consecutive kept points.
slopes = np.diff(d_bics)/(np.diff(d_complexities)*d_bics[:-1])
try:
    thres = np.percentile(np.abs(slopes), per)
    # Round up using sci_format (presumably returns (mantissa, exponent) -- confirm).
    thres = math.ceil(sci_format(thres)[0])*10**sci_format(thres)[1]
except IndexError:
    # Fallback when the percentile cannot be computed (e.g. no slopes available).
    thres = 1/40
# The threshold is never allowed below 1/40.
min_thres = 1/40
thres = max(thres, min_thres)
print("threshold:", thres)

# Per-candidate quantity 2*|log-likelihood| - log(n)*complexity, used only to
# derive the largest exponent to start the sweep from.
lower_bounds = []
for k, efi in enumerate(best_subsets):
    # assert len(efi) == np.count_nonzero(post_means[:, k:k+1])
    com = len(efi)
    lower_bound = 2*np.abs(log_like_value(predictions[:, k:k+1], y_pre))-np.log(len(y_pre))*com
    lower_bounds.append(lower_bound)

# Starting exponent: log10 of the largest lower_bound/(uncertainty*scale) ratio.
last_lam = np.log10(max(lower_bounds/(b_uns*scale)))
print("max_lam:", last_lam)
delta = last_lam/tau
now_lam = last_lam-delta
last_ubic = UBIC(b_bics, b_uns, len(y_pre), hyp=10**last_lam, scale=scale)
last_bc = np.argmin(last_ubic)
# Distinct best-candidate positions visited during the sweep, in order.
bc_seq = [last_bc]
# NOTE(review): if last_lam is exactly 0 then delta is 0 and now_lam never
# drops below 0, so this loop can only exit through the break -- confirm that
# case cannot occur.
while now_lam >= 0:
    now_ubic = UBIC(b_bics, b_uns, len(y_pre), hyp=10**now_lam, scale=scale)
    now_bc = np.argmin(now_ubic)
    
    # Change in position and in BIC between the previous and the current minimizer.
    diff_com = now_bc-last_bc
    diff_bic = b_bics[now_bc]-b_bics[last_bc]
    # Relative BIC change per unit of complexity; stays NaN when the minimizer did not move.
    imp = np.nan
    if diff_com != 0:
        imp = abs(diff_bic/(b_bics[last_bc]*diff_com))
    
    if verbose:
        print(min(last_bc, now_bc), '<--->', max(last_bc, now_bc), 
              np.nan_to_num(imp, nan=np.inf))
    
    # Stop before accepting the new minimizer when either
    #  - it is more complex and its BIC is worse or the gain is below thres, or
    #  - it is less complex, its BIC is worse, and the loss exceeds thres.
    if (diff_com > 0 and (diff_bic > 0 or imp < thres)) or \
        (diff_com < 0 and diff_bic > 0 and imp > thres):
        break
    
    # Accept the step; rounding keeps floating-point drift out of the exponent.
    last_lam = now_lam
    now_lam = round(last_lam-delta, 8)
    last_ubic = now_ubic
    last_bc = now_bc
    if last_bc not in bc_seq:
        bc_seq.append(last_bc)

# best_bc = knee(range(len(last_ubic)), last_ubic, 0.95, 'linear', direction='decreasing')
# Knee of the last accepted UBIC curve.
best_bc = knee_finder(last_ubic)
# If the knee is the first candidate although the sweep's minimizer has a lower
# BIC by more than thres per unit of complexity, look for the knee again with
# the first candidate excluded.
if best_bc == 0 and last_bc != 0 and b_bics[last_bc] < b_bics[0] and \
                                    abs((b_bics[last_bc]-b_bics[0])/(b_bics[0]*last_bc)) > thres:
    best_bc = knee(range(1, len(last_ubic)), last_ubic[1:], 0.95, 'linear')
# Prefer the sweep's minimizer over a lower-positioned knee when the relative
# BIC difference per unit of complexity exceeds thres.
if best_bc < last_bc and abs((b_bics[last_bc]-b_bics[best_bc])/(b_bics[best_bc]*(last_bc-best_bc))) > thres:
    best_bc = last_bc
    
last_lam = round(last_lam, 8)
# Displayed as the cell result: final exponent, final UBIC values, the sweep's
# minimizer position and the selected position.
last_lam, last_ubic, last_bc, best_bc

[-1570.5347918807652, -32717.155818729454, -33772.23492494959, -35073.82038923738, -44091.669188415086, -44203.340873724956, -71125.40145125426]
[1.90093119 1.         1.27770228 1.75563732 1.44046352 2.70380766
 1.47338225]
threshold: 0.6000000000000001
max_lam: 3.719433618405566
6 <---> 6 inf
6 <---> 6 inf
6 <---> 6 inf


(0.0,
 array([ -1553.0265686 , -32707.94547836, -33760.46685207, -35057.65037195,
        -44078.40202907, -44178.4378849 , -71111.83109927]),
 6,
 1)