In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import math
import random
import numpy as np
import scipy.io as sio
from tqdm import trange
import sys; sys.path.insert(0, '../')
from selective_inference import forward_stop_rule, sfs_si, stepwise_selective_inference, subset_fdr
from sklearn.linear_model import LassoLars, lars_path
from abess import LinearRegression
from sklearn.preprocessing import StandardScaler
from best_subset import brute_force_all_subsets, brute_force
# from okridge.solvel0 import okridge_solvel0, okridge_solvel0_full
from solvel0 import solvel0
import statsmodels.api as sm
from scipy import stats
import fpsample
from dppy.finite_dpps import FiniteDPP
from kneefinder import KneeFinder
from weakident_python.weakindent_model import weak_ident
from skimage.restoration import estimate_sigma
import bm3d
import pysindy as ps
from utils import *
from UBIC import *

In [2]:
X_pre = np.load("../Cache/X_poly_NS.npy")
y_pre = np.load("../Cache/y_poly_NS.npy")
feature_names = np.load("../Cache/feature_names_poly_NS.npy", allow_pickle=True)

y_pre = y_pre[:, 0:1]

### Knockoffs

In [3]:
from hidimstat import (model_x_knockoff, 
                    model_x_knockoff_pvalue, 
                    model_x_knockoff_bootstrap_quantile, 
                    model_x_knockoff_bootstrap_e_value)

# selected, test_scores, threshold, X_tildes = model_x_knockoff(X_pre, 
#                                                               y_pre, 
#                                                               centered=True, 
#                                                               n_bootstraps=25, 
#                                                               fdr=0.1, 
#                                                               n_jobs=-1)

from sklearn.utils import resample
from abess import LinearRegression as AbessLinearRegression
from knockpy import KnockoffFilter, knockoff_stats, knockoffs
from knockpy.utilities import estimate_covariance
from scipy import stats
from statsmodels.stats.multitest import multipletests
from c2st.check import c2st # https://github.com/psteinb/c2st

u_pre = y_pre.copy()
X_pre_top = StandardScaler().fit_transform(X_pre)
y_pre = StandardScaler().fit_transform(u_pre)

lr = AbessLinearRegression(path_type='gs', s_max=10, fit_intercept=False, cv=5, screening_size=0)
fstat = knockoff_stats.ShapStatistic(model=lr)
kfilter = KnockoffFilter(ksampler='gaussian', fstat=fstat, knockoff_kwargs={'method':'ci'})

# kfilter = KnockoffFilter(ksampler='gaussian', fstat='lasso', knockoff_kwargs={'method':'ci'})

alibi is not installed in the environment.


In [4]:
fdr = 1/3 # 1/4, 1/3, 1/2
rejections = []
test_scores = []
thresholds = []
for _ in trange(50):
    np.random.seed(_)
    X_resample, y_resample = resample(X_pre_top, y_pre.flatten(), n_samples=10000, replace=True, random_state=_)
    rejection = kfilter.forward(X=X_resample, y=y_resample, fdr=fdr, shrinkage="ledoitwolf", recycle_up_to=0.5, tol=1e-4)
    # rejection = kfilter.forward(X=X_pre_top, y=y_pre.flatten(), fdr=fdr, shrinkage="ledoitwolf", recycle_up_to=0.5, tol=1e-4)
    rejection = sorted(set(np.where(rejection == 1)[0]))
    if len(rejection) > 0:
        rejections.append(rejection)
        test_scores.append(kfilter.W)
        thresholds.append(kfilter.threshold)
del X_resample, y_resample

100%|██████████████████████████████████████████████████████████████████| 50/50 [00:22<00:00,  2.20it/s]


In [5]:
aggregated_ko_selection, _, _ = model_x_knockoff_bootstrap_quantile(test_scores, 
                                                                    fdr=fdr, 
                                                                    adaptive_aggregation=False)
print(feature_names[aggregated_ko_selection])

eval_selection, _, _ = model_x_knockoff_bootstrap_e_value(test_scores, thresholds, fdr=fdr)
print(feature_names[eval_selection])

['w' 'v' 'w_{xx}' 'w_{yy}' 'ww_{y}' 'uw_{x}' 'uw_{yy}' 'vw_{y}']
['w' 'v' 'w_{x}' 'w_{xx}' 'w_{yy}' 'ww_{y}' 'ww_{xx}' 'uw_{x}' 'uw_{xx}'
 'uw_{yy}' 'vw_{y}']


In [6]:
rejections = np.array(eval_selection)
X_pre_top = X_pre[:, rejections]

In [7]:
non_null_indices, shap_values = shap_model_selection(X_pre_top, y_pre)
scale_shap_values = abs(shap_values).mean(axis=0)
rejections = rejections[non_null_indices]
X_pre_top = X_pre_top[:, non_null_indices]
y_pre = u_pre.copy()
feature_names[rejections]

LinearExplainer explainer: 8342751it [00:36, 161225.82it/s]                                            


array(['uw_{x}', 'vw_{y}', 'w_{yy}', 'w_{xx}', 'w_{x}', 'w', 'uw_{yy}',
       'v'], dtype='<U7')

In [8]:
best_subsets = solvel0(X_pre_top, y_pre.flatten(), max_complexity=8)

  0%|                                                                            | 0/8 [00:00<?, ?it/s]

Set parameter Username
Academic license - for non-commercial use only - expires 2026-04-04


100%|████████████████████████████████████████████████████████████████████| 8/8 [00:12<00:00,  1.53s/it]


In [9]:
tau = 3
verbose = True
# scale = 1 <- generalized UBIC
scale = np.log(len(y_pre))
per = 75 # 80

post_means, b_bics, b_uns = baye_uncertainties(best_subsets, (X_pre_top, y_pre), 
                                               u_type='cv1', take_sqrt=True, 
                                               ridge_lambda=0, 
                                               threshold=0)
# b_uns = ard_uns # USE ard_uns INSTEAD
predictions = X_pre_top@post_means
print(b_bics)
print(b_uns)
b_bics = np.array(b_bics)
max_complexity = len(b_bics)
complexities = np.arange(max_complexity)+1
d_complexities = complexities[decreasing_values_indices(b_bics)]
d_bics = b_bics[decreasing_values_indices(b_bics)]
slopes = np.diff(b_bics)/(np.diff(complexities)*b_bics[:-1])
try:
    thres = np.percentile(np.abs(np.diff(d_bics)/(np.diff(d_complexities)*d_bics[:-1])), per)
    thres = np.round(sci_format(thres)[0])*10**sci_format(thres)[1]
except IndexError:
    thres = 1/40
min_thres = 1/40
thres = max(thres, min_thres)
print("threshold:", thres)

lower_bounds = []
for k, efi in enumerate(best_subsets):
    # assert len(efi) == np.count_nonzero(post_means[:, k:k+1])
    com = len(efi)
    lower_bound = 2*np.abs(log_like_value(predictions[:, k:k+1], y_pre))-np.log(len(y_pre))*com
    lower_bounds.append(lower_bound)

last_lam = np.log10(max(lower_bounds/(b_uns*scale)))
print("max_lam:", last_lam)
delta = last_lam/tau
now_lam = last_lam-delta
last_ubic = UBIC(b_bics, b_uns, len(y_pre), hyp=10**last_lam, scale=scale)
last_bc = np.argmin(last_ubic)
bc_seq = [last_bc]
while now_lam >= 0:
    now_ubic = UBIC(b_bics, b_uns, len(y_pre), hyp=10**now_lam, scale=scale)
    now_bc = np.argmin(now_ubic)
    
    diff_com = now_bc-last_bc
    diff_bic = b_bics[now_bc]-b_bics[last_bc]
    imp = np.nan
    if diff_com != 0:
        imp = abs(diff_bic/(b_bics[last_bc]*diff_com))
    
    if verbose:
        print(min(last_bc, now_bc), '<--->', max(last_bc, now_bc), 
              np.nan_to_num(imp, nan=np.inf))
    
    if (diff_com > 0 and (diff_bic > 0 or imp < thres)) or \
        (diff_com < 0 and diff_bic > 0 and imp > thres):
        break
    
    last_lam = now_lam
    now_lam = round(last_lam-delta, 8)
    last_ubic = now_ubic
    last_bc = now_bc
    if last_bc not in bc_seq:
        bc_seq.append(last_bc)

# best_bc = knee(range(len(last_ubic)), last_ubic, 0.95, 'linear', direction='decreasing')
best_bc = knee_finder(last_ubic)
if best_bc == 0 and last_bc != 0 and b_bics[last_bc] < b_bics[0] and \
                                    abs((b_bics[last_bc]-b_bics[0])/(b_bics[0]*last_bc)) > thres:
    best_bc = knee(range(1, len(last_ubic)), last_ubic[1:], 0.95, 'linear')
if best_bc is None:
    best_bc = knee_finder(last_ubic)
    
last_lam = round(last_lam, 8)
last_lam, last_ubic, last_bc, best_bc

[7470686.468674479, -8719386.44663852, -18119349.71805418, -20376262.737041958, -20758605.52975359, -20927376.865343284, -21058574.52621897, -21153392.298425823]
[3.64343152 1.87341745 1.08312667 1.         1.44953855 2.99155365
 3.08361455 4.55120681]
threshold: 0.6000000000000001
max_lam: 6.106720590157475
3 <---> 4 0.018764127536330322


(6.10672059,
 array([8.17102043e+07, 2.94538598e+07, 3.95072384e+06, 1.86264515e-08,
        8.77757288e+06, 4.00293064e+07, 4.17739657e+07, 7.15831933e+07]),
 3,
 1)

In [10]:
abs((b_bics[last_bc]-b_bics[best_bc])/(b_bics[best_bc]*(last_bc-best_bc)))

0.668445902801875

In [11]:
np.random.seed(0); random.seed(0)
n_samples = min(int(250*(last_bc+1)), len(y_pre))
max_iters = len(y_pre)//n_samples
max_iters = min(max_iters, 50)
false_discovery_control_method = None
fdr_data = []
for bs in best_subsets:
    fdrs = []
    for _ in trange(max_iters):
        X_test = X_pre_top[:, bs]
        y_test = y_pre.ravel()
        
        np.random.seed(random.randint(0, 100))
        # sample_indices = sorted(set([np.random.randint(len(y_pre)) for _ in range(n_samples)]))
        sample_indices = fpsample.bucket_fps_kdline_sampling(X_test, n_samples=n_samples, h=9) # Farthest Point Sampling (FPS) is better!!!
        X_test = X_test[sample_indices]; y_test = y_test[sample_indices]
        # FPS + k-DPP
        DPP = FiniteDPP('likelihood', **{'L': X_test.dot(X_test.T)})
        DPP.flush_samples()
        for _ in range(n_samples//(len(bs))):
            DPP.sample_exact_k_dpp(size=len(bs))
        sample_indices = np.unique(np.ravel(DPP.list_of_samples))
        X_test = X_test[sample_indices]; y_test = y_test[sample_indices]
        
        manager = stepwise_selective_inference(support_size=X_test.shape[1])
        M, p_list = manager.inference(X_test, y_test, np.std(y_test))
        if false_discovery_control_method is not None:
            p_list = stats.false_discovery_control(p_list, method=false_discovery_control_method)
        
        fdrs.append(subset_fdr(p_list))
        
    fdrs = np.array(fdrs)
    if fdrs.mean() < 1:
        print(len(bs), fdrs.mean())
        fdr_data.append(fdrs)
        
fdr_data = np.array(fdr_data)

100%|██████████████████████████████████████████████████████████████████| 50/50 [01:07<00:00,  1.35s/it]


1 0.0


100%|██████████████████████████████████████████████████████████████████| 50/50 [01:17<00:00,  1.55s/it]


2 0.0


100%|██████████████████████████████████████████████████████████████████| 50/50 [01:38<00:00,  1.96s/it]


3 0.000554271195338672


100%|██████████████████████████████████████████████████████████████████| 50/50 [01:36<00:00,  1.92s/it]


4 0.05729145796714814


100%|██████████████████████████████████████████████████████████████████| 50/50 [01:44<00:00,  2.10s/it]


5 0.34270588881384206


100%|██████████████████████████████████████████████████████████████████| 50/50 [01:53<00:00,  2.28s/it]


6 0.12672865590114468


100%|██████████████████████████████████████████████████████████████████| 50/50 [01:46<00:00,  2.14s/it]


7 0.29843403583142236


100%|██████████████████████████████████████████████████████████████████| 50/50 [01:59<00:00,  2.40s/it]

8 0.4581499546466102





In [12]:
from sklearn.cluster import AffinityPropagation, KMeans
print(AffinityPropagation().fit(fdr_data).labels_)
print(KMeans(n_clusters=2).fit(fdr_data).labels_)

[0 0 0 0 1 0 1 2]
[0 0 0 0 1 0 1 1]


### Selective inference

In [13]:
n_terms = 16
max_complexity = 10
alphas = [0.3, 0.25, 0.2, 0.15, 0.1, 0.05, 0.01]

### OPTION-I ###
_, lars_p, _ = lars_path(StandardScaler().fit_transform(X_pre), y_pre.flatten(), method='lasso', alpha_min=1e-6, max_iter=1000)
lars_p = np.array(list(map(int, lars_p)))[:n_terms]

### OPTION-II ###
# nonzero = np.nonzero(AbessLinearRegression(s_min=1, s_max=n_terms, path_type='gs', fit_intercept=False, alpha=1e-9, max_iter=100).fit(X_pre, y_pre.flatten()).coef_)[0]
# nonzero = np.nonzero(MIOSR(X_pre, y_pre, alpha=1e-9, non_zero=min(len(nonzero), n_terms)))[0]
# _, lars_p, _ = lars_path(StandardScaler().fit_transform(X_pre[:, nonzero]), y_pre.flatten(), method='lasso', alpha_min=0)
# lars_p = nonzero[np.array(list(map(int, lars_p)))][:n_terms]

np.random.seed(1234)
max_fdr = 0
for _ in range(50):
    sub_indices = sorted(np.random.random_integers(len(y_pre), size=10000))
    X_test = X_pre[sub_indices, :][:, lars_p]
    y_test = y_pre[sub_indices]
    sigma = np.std(y_test-X_test@np.linalg.lstsq(X_test, y_test)[0], ddof=1)
    manager = stepwise_selective_inference(support_size=len(lars_p))
    _, p_list = manager.inference(X_test, y_test, sigma)
    # print(lars_p, p_list, subset_fdr(p_list))
    
    for alpha in alphas:
        adjusted_pvalues = p_list
        stop_step, false_discovery_rates = forward_stop_rule(adjusted_pvalues, alpha)
        adjusted_pvalues = adjusted_pvalues[:stop_step+1]
        rejections = np.sort(lars_p[:stop_step+1])
        if len(rejections) <= max_complexity: 
            break
    print(alpha)
    max_fdr += alpha

max_fdr = max_fdr/50
max_fdr

0.01
0.05
0.05
0.1
0.05
0.01
0.15
0.05
0.01
0.3
0.1
0.01
0.01
0.05
0.05
0.01
0.1
0.2
0.01
0.05
0.05
0.05
0.1
0.1
0.01
0.01
0.01
0.1
0.15
0.05
0.15
0.01
0.01
0.01
0.01
0.01
0.1
0.01
0.05
0.01
0.3
0.1
0.01
0.01
0.01
0.05
0.01
0.15
0.1
0.1


0.06419999999999995