In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import scienceplots

import time
import math
import os
import random
from functools import partial
from decimal import Decimal
from collections import Counter
import numpy as np
import scipy.io as sio
import pysindy as ps
from tqdm import trange

from utils import *
from skimage.restoration import estimate_sigma
import bm3d
from solvel0 import solvel0
from best_subset import backward_refinement, brute_force_all_subsets
from UBIC import *
from kneed import KneeLocator
from bayesian_model_evidence import log_evidence

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

In [2]:
# Which target column of uv_pre is being identified in this run: 'u' or 'v'.
target_name = 'u'

# Ground truth: for each target, the column indices of the true terms in the
# feature matrix and the matching coefficients. The indices are checked
# against the selected columns after the screening step.
ground_indices_u = (0, 7, 11, 17, 25)
ground_indices_v = (1, 7, 12, 18, 26)
ground_coeff_u = np.array([-0.014, -1.0, 0.02, 0.02, 0.02])
ground_coeff_v = np.array([-0.067, 1.0, 0.01, 0.01, 0.01])

# Cached regression problem: feature matrix, the two-column (u, v) targets and
# the feature labels. The file names suggest this is the non-constant
# coefficient variant — verify against the script that wrote the cache.
feature_names = np.load("./Cache/nonconstant_feature_names_GS.npy")
uv_pre = np.load("./Cache/y_pre_nonconstant_GS_2025.npy")
X_pre = np.load("./Cache/X_pre_nonconstant_GS_2025.npy")

# Alternative cache (swap in to use the other variant):
# X_pre = np.load("./Cache/X_pre_GS_2025.npy")
# uv_pre = np.load("./Cache/y_pre_GS_2025.npy")
# feature_names = np.load("./Cache/feature_names_GS_2025.npy")

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn import covariance
from abess import LinearRegression as AbessLinearRegression
from knockpy import KnockoffFilter, knockoff_stats, knockoffs
from knockpy.utilities import estimate_covariance
from scipy import stats
from statsmodels.stats.multitest import multipletests
### extra ###
from c2st.check import c2st # https://github.com/psteinb/c2st

# --- Knockoff-filter screening -------------------------------------------
# Standardise the feature matrix and the selected target column, then run the
# knockoff filter repeatedly and keep every non-empty set of selected columns.
# NOTE(review): no random seed is set anywhere in view, so the repeated runs
# are presumably not reproducible across kernel restarts — consider seeding.

N_KNOCKOFF_RUNS = 50  # number of knockoff-filter repetitions
fdr = 0.2             # false discovery rate level passed to the filter

# Fail fast on an unsupported target. Previously an unknown target_name left
# y_pre with both columns, and flattening it no longer matched the rows of X.
target_columns = {'u': 0, 'v': 1}
if target_name not in target_columns:
    raise ValueError(f"target_name must be 'u' or 'v', got {target_name!r}")
target_col = target_columns[target_name]

X_scale = StandardScaler().fit_transform(X_pre)
X_pre_top = X_scale.copy()
# Keep y_pre two-dimensional (n, 1); later cells pass it on in that shape.
y_pre = StandardScaler().fit_transform(uv_pre)[:, target_col:target_col + 1]
# 1-D view of the target for the filter; loop-invariant, so computed once.
y_flat = y_pre.flatten()

# Feature statistic: permutation importance of an abess sparse regression.
# Alternatives tried:
# lr = SCO(path_type='gs', sparsity=10, ic_method='LinearSIC')
# fstat = knockoff_stats.ShapStatistic(model=lr)
# kfilter = KnockoffFilter(ksampler='gaussian', fstat='lasso', knockoff_kwargs={'method':'ci'})
lr = AbessLinearRegression(path_type='gs', s_max=12, fit_intercept=True, cv=5, screening_size=0)
fstat = knockoff_stats.Eli5PIStatistic(model=lr, n_iter=10)
kfilter = KnockoffFilter(ksampler='gaussian', fstat=fstat, knockoff_kwargs={'method':'ci'})

rejections = []
for _ in trange(N_KNOCKOFF_RUNS):
    rejection = kfilter.forward(X=X_pre_top, y=y_flat, fdr=fdr, shrinkage="ledoitwolf", recycle_up_to=0.5, tol=1e-3)
    # Convert the 0/1 indicator vector into the set of selected column indices.
    rejection = set(np.where(rejection == 1)[0])
    # Runs that select nothing carry no information for the later superset.
    if rejection:
        rejections.append(rejection)

alibi is not installed in the environment.


100%|████████████████████████████████████████████████████████████████████████████████| 50/50 [01:26<00:00,  1.73s/it]


In [4]:
# Collapse the per-run selections into one candidate set. The helper is
# evaluated once here instead of once per use.
# NOTE: this cell overwrites `rejections` and `X_pre_top`; re-run the
# screening cell before re-running this one.
selected_superset = list(biggest_superset(rejections))

# Sanity check against the known model: every ground-truth column must have
# survived the screening. On failure, report exactly which ones were lost.
ground_truth_by_target = {'u': ground_indices_u, 'v': ground_indices_v}
if target_name in ground_truth_by_target:
    missing = set(ground_truth_by_target[target_name]) - set(selected_superset)
    assert not missing, (
        f"ground-truth columns missing from the knockoff selection "
        f"for target {target_name!r}: {sorted(missing)}"
    )

# From here on `rejections` is a sorted index array into the original feature
# matrix, and X_pre_top holds only those columns.
rejections = np.array(sorted(selected_superset))
X_pre_top = X_pre_top[:, rejections]

In [5]:
# Iterative pruning of the candidate columns. Each pass:
#   1. asks shap_model_selection which columns to keep and for SHAP values,
#   2. locates a knee in the cumulative mean |SHAP| curve,
#   3. for every column past the knee (last column first) swaps that column
#      with a sampled knockoff 50 times and uses c2st to compare the SHAP
#      values of the original and the swapped column,
#   4. if any tested column fails the significance test, re-selects a smaller
#      subset with solvel0 and repeats; otherwise stops.
# `shap`, `linear_model`, `shap_model_selection` and `knee_finder` are not
# imported by name in this notebook — presumably they come from the star
# imports (utils / UBIC); TODO confirm.
alpha = 0.05  # significance level for the Wilcoxon p-value below
while True:
    non_null_indices, shap_values = shap_model_selection(X_pre_top, 
                                                         y_pre)
    # Mean absolute SHAP value per column.
    scale_shap_values = abs(shap_values).mean(axis=0)
    # Keep `rejections` (indices into the original feature matrix) and the
    # columns of X_pre_top aligned with the helper's selection.
    # NOTE(review): shap_values is indexed by column position j below, after
    # X_pre_top has been filtered — this assumes shap_values already has one
    # column per kept feature, in the same order. Confirm in the helper.
    rejections = rejections[non_null_indices]
    X_pre_top = X_pre_top[:, non_null_indices]
    # stop = -1   (would test every column instead of only those past the knee)
    stop = knee_finder(-np.cumsum(scale_shap_values))
    print(scale_shap_values)
    print(rejections)

    decision = True  # stays True only if every tested column passes
    Sigma, invSigma = estimate_covariance(X_pre_top, 1e-3, "graphicallasso")
    # Walk from the last column down to (but excluding) index `stop`.
    for j in range(len(rejections)-1, stop, -1):
        classifier_confidences = []
        for _ in trange(50):
            # Fresh knockoff copy of the design on every repetition.
            Xk = knockoffs.GaussianSampler(X_pre_top, Sigma=Sigma, invSigma=invSigma, 
                                           method='ci').sample_knockoffs()
            # Replace only column j by its knockoff; the rest stay original.
            Xn = X_pre_top.copy()
            Xn[:, j] = Xk[:, j]
            
            # SHAP values of an intercept-free least-squares fit on the swapped design.
            swap_explainer = shap.explainers.Linear(linear_model.LinearRegression(fit_intercept=False).fit(Xn, y_pre),
                                                    Xn)
            swap_shap_values = swap_explainer(Xn).values
            
            # c2st score between original and swapped SHAP values of column j,
            # using a logistic-regression classifier (printed below as the
            # classifier's accuracy — presumably ~0.5 means indistinguishable).
            classifier_confidences.append(c2st(shap_values[:, j:j+1], swap_shap_values[:, j:j+1], clf=linear_model.LogisticRegression()))
    
        classifier_confidences = np.array(classifier_confidences)
        # One-sided Wilcoxon signed-rank test on (score - 0.51): a small
        # p-value indicates the scores sit above the 0.51 threshold.
        pv = stats.wilcoxon(classifier_confidences-0.51, alternative='greater').pvalue
        
        print("binary classifier's acc:", classifier_confidences.mean())
        print("P-value:", pv)
    
        # Column j is not significantly distinguishable from its knockoff:
        # stop testing and shrink the model.
        if not pv < alpha:
            decision = False
            break

    if not decision:
        # Re-select with solvel0, allowing at most one column fewer than now;
        # the last element of its result is used as the new column index set.
        non_null_indices = list(solvel0(X_pre_top, y_pre, max_complexity=len(rejections)-1, miosr=True, refine=True)[-1])
        rejections = rejections[non_null_indices]
        X_pre_top = X_pre_top[:, non_null_indices]
    else:
        break

[0.30648507 0.21480066 0.17021094 0.02437514 0.02210213]
[17 25  7 11  0]


100%|████████████████████████████████████████████████████████████████████████████████| 50/50 [00:15<00:00,  3.26it/s]


binary classifier's acc: 0.5633311662649896
P-value: 3.777610438190336e-10


100%|████████████████████████████████████████████████████████████████████████████████| 50/50 [00:15<00:00,  3.24it/s]

binary classifier's acc: 0.5518376680721423
P-value: 3.77675632193688e-10





### Best-subset selection ###

In [6]:
# Rebuild the regression problem from the raw (unstandardised) arrays,
# restricted to the surviving columns, and scale every column to unit
# Euclidean norm.
selected_columns = X_pre[:, rejections]
X_pre_top = selected_columns / np.linalg.norm(selected_columns, ord=2, axis=0)

# Pick the raw target column; for any other target_name y_pre keeps its
# previous value, exactly as before.
target_col = {'u': 0, 'v': 1}.get(target_name)
if target_col is not None:
    y_pre = uv_pre[:, target_col:target_col + 1]
y_pre = y_pre / np.linalg.norm(y_pre, ord=2, axis=0)

In [7]:
# Run the project's brute-force subset search on the unit-norm columns and
# target. Only the second return value is kept; the first is discarded.
# NOTE(review): presumably best_subsets holds the best column subset for each
# support size — confirm in best_subset.brute_force_all_subsets.
_, best_subsets = brute_force_all_subsets(X_pre_top, y_pre)

100%|█████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 323.34it/s]
