diff --git a/doc/conf.py b/doc/conf.py
index 18e874a..020f980 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -220,9 +220,6 @@ pyvista.OFF_SCREEN = False
 except Exception:
     pass
-else:
-    brain_scraper = mne.viz._brain._BrainScraper()
-    scrapers += (brain_scraper, 'pyvista')
 
 if any(x in scrapers for x in ('pyvista')):
     from traits.api import push_exception_handler
     push_exception_handler(reraise_exceptions=True)
diff --git a/doc/index.rst b/doc/index.rst
index 0d06abd..3939a59 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -116,6 +116,18 @@ Application to source localization (MEG/EEG data):
   desparsified multi-task Lasso. In Proceedings of the 34th Conference on
   Neural Information Processing Systems (NeurIPS 2020), Vancouver, Canada.
 
+Single- and group-level statistically valid variable importance using conditional permutations:
+
+* Chamma, A., Thirion, B., & Engemann, D. (2024). **Variable importance in
+  high-dimensional settings requires grouping**. In Proceedings of the 38th
+  Conference of the Association for the Advancement of Artificial
+  Intelligence (AAAI 2024), Vancouver, Canada.
+
+* Chamma, A., Engemann, D., & Thirion, B. (2023). **Statistically Valid Variable
+  Importance Assessment through Conditional Permutations**. In Proceedings of the
+  37th Conference on Neural Information Processing Systems (NeurIPS 2023), New
+  Orleans, USA.
+
 If you use our packages, we would appreciate citations to the relevant
 aforementioned papers.
diff --git a/examples/plot_diabetesFeatures_importance_example.py b/examples/plot_diabetesFeatures_importance_example.py
new file mode 100644
index 0000000..d891dc9
--- /dev/null
+++ b/examples/plot_diabetesFeatures_importance_example.py
@@ -0,0 +1,101 @@
+"""
+Variable Importance on diabetes dataset
+=======================================
+
+This example compares the standard permutation approach for variable importance
+and its conditional variant on the diabetes dataset for the single-level case.
+"""
+
+#############################################################################
+# Imports needed for this script
+# ------------------------------
+
+import numpy as np
+from hidimstat.BBI import BlockBasedImportance
+from sklearn.datasets import load_diabetes
+import matplotlib.pyplot as plt
+plt.rcParams.update({'font.size': 14})
+
+# Fixing the random seed
+rng = np.random.RandomState(2024)
+
+diabetes = load_diabetes()
+X, y = diabetes.data, diabetes.target
+
+# Number of cross-validation folds used with the provided learner
+k_fold = 2
+# Identifying the categorical (nominal & ordinal) variables
+list_nominal = {}
+
+#############################################################################
+# Standard Variable Importance
+# ----------------------------
+
+bbi_perm = BlockBasedImportance(
+    estimator='RF',
+    importance_estimator="Mod_RF",
+    do_hyper=True,
+    dict_hyper=None,
+    conditional=False,
+    group_stacking=False,
+    prob_type="regression",
+    k_fold=k_fold,
+    list_nominal=list_nominal,
+    n_jobs=10,
+    verbose=0,
+    n_perm=100,
+)
+bbi_perm.fit(X, y)
+print("Computing the importance scores with standard permutation")
+results_perm = bbi_perm.compute_importance()
+pvals_perm = -np.log10(results_perm["pval"] + 1e-10)
+
+#############################################################################
+# Conditional Variable Importance
+# -------------------------------
+
+bbi_cond = BlockBasedImportance(
+    estimator='RF',
+    importance_estimator="Mod_RF",
+    do_hyper=True,
+    dict_hyper=None,
+    conditional=True,
+    group_stacking=False,
+    prob_type="regression",
+    k_fold=k_fold,
+    list_nominal=list_nominal,
+    n_jobs=10,
+    verbose=0,
+    n_perm=100,
+)
+bbi_cond.fit(X, y)
+print("Computing the importance scores with conditional permutation")
+results_cond = bbi_cond.compute_importance()
+pvals_cond = -np.log10(results_cond["pval"] + 1e-5)
+
+#############################################################################
+# Plotting the comparison
+# -----------------------
+
+list_res = {'Perm': [], 'Cond': []}
+for ind_el, el in enumerate(diabetes.feature_names):
+    list_res['Perm'].append(pvals_perm[ind_el][0])
+    list_res['Cond'].append(pvals_cond[ind_el][0])
+
+x = np.arange(len(diabetes.feature_names))
+width = 0.25  # the width of the bars
+multiplier = 0
+fig, ax = plt.subplots(figsize=(5, 5), layout='constrained')
+
+for attribute, measurement in list_res.items():
+    offset = width * multiplier
+    rects = ax.bar(x + offset, measurement, width, label=attribute)
+    multiplier += 1
+
+ax.set_ylabel(r'$-log_{10}p_{val}$')
+ax.set_xticks(x + width/2, diabetes.feature_names)
+ax.legend(loc='upper left', ncols=2)
+ax.set_ylim(0, 3)
+ax.axhline(y=-np.log10(0.05), color='r', linestyle='-')
+
+plt.show()
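In the example above, importance is reported as -log10 p-values (the small offsets added to "pval" guard against taking log10 of zero), so a variable is significant at the 5% level exactly when its bar rises above -log10(0.05) ≈ 1.3, which is what the red horizontal line marks. A quick check of that threshold, independent of the example:

import numpy as np

alpha = 0.05
print(-np.log10(alpha))   # ≈ 1.301: the red line in the figure
print(-np.log10(0.01))    # 2.0: a p-value of 0.01 clears the line
print(-np.log10(0.1))     # 1.0: a p-value of 0.1 does not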
+""" + +############################################################################# +# Imports needed for this script +# ------------------------------ + +import numpy as np +from hidimstat.BBI import BlockBasedImportance +from sklearn.datasets import load_diabetes +import matplotlib.pyplot as plt +plt.rcParams.update({'font.size': 14}) + +# Fixing the random seed +rng = np.random.RandomState(2024) + +diabetes = load_diabetes() +X, y = diabetes.data, diabetes.target + +# Use or not a cross-validation with the provided learner +k_fold = 2 +# Identifying the categorical (nominal & ordinal) variables +list_nominal = {} + +############################################################################# +# Standard Variable Importance +# ---------------------------- + +bbi_perm = BlockBasedImportance( + estimator='RF', + importance_estimator="Mod_RF", + do_hyper=True, + dict_hyper=None, + conditional=False, + group_stacking=False, + prob_type="regression", + k_fold=k_fold, + list_nominal=list_nominal, + n_jobs=10, + verbose=0, + n_perm=100, +) +bbi_perm.fit(X, y) +print("Computing the importance scores with standard permutation") +results_perm = bbi_perm.compute_importance() +pvals_perm = -np.log10(results_perm["pval"] + 1e-10) + +############################################################################# +# Conditional Variable Importance +# ------------------------------- + +bbi_cond = BlockBasedImportance( + estimator='RF', + importance_estimator="Mod_RF", + do_hyper=True, + dict_hyper=None, + conditional=True, + group_stacking=False, + prob_type="regression", + k_fold=k_fold, + list_nominal=list_nominal, + n_jobs=10, + verbose=0, + n_perm=100, +) +bbi_cond.fit(X, y) +print("Computing the importance scores with conditional permutation") +results_cond = bbi_cond.compute_importance() +pvals_cond = -np.log10(results_cond["pval"] + 1e-5) + +############################################################################# +# Plotting the comparison +# ----------------------- + +list_res = {'Perm': [], 'Cond': []} +for ind_el, el in enumerate(diabetes.feature_names): + list_res['Perm'].append(pvals_perm[ind_el][0]) + list_res['Cond'].append(pvals_cond[ind_el][0]) + +x = np.arange(len(diabetes.feature_names)) +width = 0.25 # the width of the bars +multiplier = 0 +fig, ax = plt.subplots(figsize=(5, 5), layout='constrained') + +for attribute, measurement in list_res.items(): + offset = width * multiplier + rects = ax.bar(x + offset, measurement, width, label=attribute) + multiplier += 1 + +ax.set_ylabel(r'$-log_{10}p_{val}$') +ax.set_xticks(x + width/2, diabetes.feature_names) +ax.legend(loc='upper left', ncols=2) +ax.set_ylim(0, 3) +ax.axhline(y=-np.log10(0.05), color='r', linestyle='-') + +plt.show() diff --git a/examples/plot_meg_data_example.py b/examples_not_exhibited/plot_meg_data_example.py similarity index 99% rename from examples/plot_meg_data_example.py rename to examples_not_exhibited/plot_meg_data_example.py index fb955c4..fd92ac2 100644 --- a/examples/plot_meg_data_example.py +++ b/examples_not_exhibited/plot_meg_data_example.py @@ -100,7 +100,7 @@ def _load_somato(cond): # Get data paths data_path = somato.data_path() subject = '01' - subjects_dir = data_path + '/derivatives/freesurfer/subjects' + subjects_dir = data_path / '/derivatives/freesurfer/subjects' raw_fname = os.path.join(data_path, f'sub-{subject}', 'meg', f'sub-{subject}_task-{cond}_meg.fif') fwd_fname = os.path.join(data_path, 'derivatives', f'sub-{subject}', diff --git a/hidimstat/BBI.py b/hidimstat/BBI.py index 
@@ -697,11 +709,12 @@ def compute_importance(self, X=None, y=None):
         else:
             if self.coffeine_transformer is not None:
                 X = self.coffeine_transformers[0].transform(pd.DataFrame(X, columns=self.X_cols))
-                # Variables are provided as the third element of the
-                # coffeine transformer parameter
-                if len(self.coffeine_transformer) > 2:
-                    X = X[:, self.coffeine_transformer[2]]
-                    self.list_cont = np.arange(len(self.coffeine_transformer[2]))
+                if not self.transformer_grp:
+                    # Variables are provided as the third element of the
+                    # coffeine transformer parameter
+                    if len(self.coffeine_transformer) > 2:
+                        X = X[:, self.coffeine_transformer[2]]
+                        self.list_cont = np.arange(len(self.coffeine_transformer[2]))
         # Perform stacking if enabled
         if self.apply_ridge:
             X_prev = X.copy()
@@ -773,6 +786,7 @@ def compute_importance(self, X=None, y=None):
                         index_i=ind_fold + 1,
                         group_stacking=self.group_stacking,
                         random_state=list_seeds_imp[perm],
+                        verbose=self.verbose,
                     )
                     for p_col in range(len(self.list_cols))
                     for perm in range(self.n_perm)
@@ -812,9 +826,11 @@ def compute_importance(self, X=None, y=None):
                         proc_col=p_col,
                         index_i=ind_fold + 1,
                         group_stacking=self.group_stacking,
+                        sub_groups=[self.list_cols, self.sub_groups],
                         list_seeds=list_seeds_imp,
                         Perm=self.Perm,
                         output_dim=output_dim,
+                        verbose=self.verbose,
                     )
                     for p_col in range(len(self.list_cols))
                 )
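compute_importance() now forwards sub_groups to the conditional engine packed as [self.list_cols, self.sub_groups]. Reading joblib_compute_conditional below, the user-facing part is a dict mapping the 1-based position of the variable/group under test to the 1-based positions of the variables/groups to condition on, instead of conditioning on everything else. A hypothetical call illustrating that shape (the 1-based convention is inferred from the proc_col+1 and i-1 shifts in the code, not from documented API):

# Assess variable 3 conditionally on variables 1 and 2 only.
bbi = BlockBasedImportance(
    estimator='RF',
    importance_estimator="Mod_RF",
    conditional=True,
    prob_type="regression",
    sub_groups={3: [1, 2]},
    k_fold=2,
)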
diff --git a/hidimstat/compute_importance.py b/hidimstat/compute_importance.py
index 517547c..1be8496 100644
--- a/hidimstat/compute_importance.py
+++ b/hidimstat/compute_importance.py
@@ -1,6 +1,6 @@
 import warnings
 from collections import Counter
-
+import itertools
 import numpy as np
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 from sklearn.metrics import mean_absolute_error, r2_score, roc_auc_score
@@ -34,10 +34,12 @@ def joblib_compute_conditional(
     encoder={},
     proc_col=None,
     index_i=None,
+    sub_groups=None,
     group_stacking=False,
     list_seeds=None,
     Perm=False,
     output_dim=1,
+    verbose=0,
 ):
     """This function applies the conditional approach for feature importance.
     Parameters
@@ -86,7 +88,8 @@ def joblib_compute_conditional(
     Perm: bool, default=False
         The use of permutations or random sampling with CPI-DNN.
     output_dim:
-
+    verbose: int, default=0
+        If verbose > 0, progress is printed for each fold/column/sample iteration.
     """
     rng = np.random.RandomState(seed)
 
@@ -100,6 +103,25 @@ def joblib_compute_conditional(
         for X_test_el in X_test_list
     ]
 
+    el_ind = None
+    # Conditioning on a subset of variables
+    if sub_groups is not None and sub_groups[1] is not None:
+        if (proc_col+1) in sub_groups[1].keys():
+            # Get the indices of the provided variables/groups
+            el = [i-1 for i in sub_groups[1][proc_col+1]]
+            if not group_stacking:
+                el = list(itertools.chain(*[sub_groups[0][i] for i in el]))
+                el_ind = []
+                for val in el:
+                    if val in dict_nom.keys():
+                        el_ind += dict_nom[val]
+                    if val in dict_cont.keys():
+                        el_ind += dict_cont[val]
+                el_ind = list(set(el_ind))
+            else:
+                # Needs to be tested with multi-output neurons
+                el_ind = el.copy()
+
     Res_col = [None] * len(current_X_test_list)
     X_col_pred = {
         "regression": [None] * len(current_X_test_list),
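When group_stacking is off, the hunk above first flattens the conditioning groups to variable names with itertools.chain and then maps the names to column positions through dict_nom/dict_cont. A self-contained sketch of that bookkeeping, with hypothetical names and positions:

import itertools

list_cols = [['age', 'bmi'], ['bp']]             # candidate groups (sub_groups[0])
dict_cont = {'age': [0], 'bmi': [2], 'bp': [3]}  # variable -> column position(s)
dict_nom = {}                                    # no categorical variables here

conditioning = [0, 1]                            # 0-based group positions
flat = list(itertools.chain(*[list_cols[i] for i in conditioning]))
el_ind = sorted(set(
    idx for v in flat
    for idx in dict_nom.get(v, []) + dict_cont.get(v, [])
))
print(el_ind)   # [0, 2, 3]: the columns handed to the conditional model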
""" rng = np.random.RandomState(seed) @@ -100,6 +103,25 @@ def joblib_compute_conditional( for X_test_el in X_test_list ] + el_ind = None + # Conditioning on a subset of variables + if sub_groups[1] is not None: + if (proc_col+1) in sub_groups[1].keys(): + # Get the indices of the provided variables/groups + el = [i-1 for i in sub_groups[1][proc_col+1]] + if not group_stacking: + el = list(itertools.chain(*[sub_groups[0][i] for i in el])) + el_ind = [] + for val in el: + if val in dict_nom.keys(): + el_ind += dict_nom[val] + if val in dict_cont.keys(): + el_ind += dict_cont[val] + el_ind = list(set(el_ind)) + else: + # Needs to be tested with multi-output neurones + el_ind = el.copy() + Res_col = [None] * len(current_X_test_list) X_col_pred = { "regression": [None] * len(current_X_test_list), @@ -194,13 +216,16 @@ def joblib_compute_conditional( if importance_estimator != "Mod_RF": if var_type["regression"]: for counter_test, X_test_comp in enumerate(current_X_test_list): - X_test_minus_idx = np.delete( - np.copy(X_test_comp), - p_col_n["regression"] - + p_col_n["classification"] - + p_col_n["ordinal"], - -1, - ) + if el_ind is not None: + X_test_minus_idx = np.copy(X_test_comp)[..., el_ind] + else: + X_test_minus_idx = np.delete( + np.copy(X_test_comp), + p_col_n["regression"] + + p_col_n["classification"] + + p_col_n["ordinal"], + -1, + ) # Nb of y outputs x Nb of samples x Nb of regression outputs output["regression"] = X_test_comp[..., p_col_n["regression"]] @@ -236,11 +261,14 @@ def joblib_compute_conditional( # is the same without the stacking part (where extra sub-linear layers are used), therefore identical inputs # won't need looping classification process. This is not the case with the regression part. if var_type["classification"]: - X_test_minus_idx = np.delete( - np.copy(current_X_test_list[0]), - p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], - -1, - ) + if el_ind is not None: + X_test_minus_idx = np.copy(current_X_test_list[0])[..., el_ind] + else: + X_test_minus_idx = np.delete( + np.copy(current_X_test_list[0]), + p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], + -1, + ) output["classification"] = np.array(X_nominal[grp_nom]) X_col_pred["classification"] = [] for cur_output_ind in range(X_test_minus_idx.shape[0]): @@ -264,11 +292,14 @@ def joblib_compute_conditional( X_col_pred["classification"] = [X_col_pred["classification"]] if var_type["ordinal"]: - X_test_minus_idx = np.delete( - np.copy(current_X_test_list[0]), - p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], - -1, - ) + if el_ind is not None: + X_test_minus_idx = np.copy(current_X_test_list[0])[..., el_ind] + else: + X_test_minus_idx = np.delete( + np.copy(current_X_test_list[0]), + p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], + -1, + ) output["ordinal"] = ordinal_encode(np.array(X_nominal[grp_ord])) X_col_pred["ordinal"] = [] for cur_output_ind in range(X_test_minus_idx.shape[0]): @@ -293,13 +324,16 @@ def joblib_compute_conditional( else: for counter_test, X_test_comp in enumerate(current_X_test_list): if var_type["regression"]: - X_test_minus_idx = np.delete( - np.copy(X_test_comp), - p_col_n["regression"] - + p_col_n["classification"] - + p_col_n["ordinal"], - -1, - ) + if el_ind is not None: + X_test_minus_idx = np.copy(X_test_comp)[..., el_ind] + else: + X_test_minus_idx = np.delete( + np.copy(X_test_comp), + p_col_n["regression"] + + p_col_n["classification"] + + p_col_n["ordinal"], + -1, + ) 
output["regression"] = X_test_comp[..., p_col_n["regression"]] for cur_output_ind in range(X_test_minus_idx.shape[0]): importance_models["regression"] = hypertune_predictor( @@ -318,11 +352,14 @@ def joblib_compute_conditional( ].sample_same_leaf(X_test_minus_idx[cur_output_ind, ...]) if var_type["classification"]: - X_test_minus_idx = np.delete( - np.copy(current_X_test_list[0]), - p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], - -1, - ) + if el_ind is not None: + X_test_minus_idx = np.copy(current_X_test_list[0])[..., el_ind] + else: + X_test_minus_idx = np.delete( + np.copy(current_X_test_list[0]), + p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], + -1, + ) output["classification"] = np.array(X_nominal[grp_nom]) for cur_output_ind in range(X_test_minus_idx.shape[0]): importance_models["classification"] = hypertune_predictor( @@ -340,11 +377,14 @@ def joblib_compute_conditional( ].sample_same_leaf(X_test_minus_idx[cur_output_ind, ...]) if var_type["ordinal"]: - X_test_minus_idx = np.delete( - np.copy(current_X_test_list[0]), - p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], - -1, - ) + if el_ind is not None: + X_test_minus_idx = np.copy(current_X_test_list[0])[..., el_ind] + else: + X_test_minus_idx = np.delete( + np.copy(current_X_test_list[0]), + p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], + -1, + ) output["ordinal"] = ordinal_encode(np.array(X_nominal[grp_ord])) for cur_output_ind in range(X_test_minus_idx.shape[0]): @@ -375,12 +415,13 @@ def joblib_compute_conditional( ) for sample in range(n_sample): - if index_i is not None: - print( - f"Iteration/Fold:{index_i}, Processing col:{proc_col+1}, Sample:{sample+1}" - ) - else: - print(f"Processing col:{proc_col+1}") + if verbose > 0: + if index_i is not None: + print( + f"Iteration/Fold:{index_i}, Processing col:{proc_col+1}, Sample:{sample+1}" + ) + else: + print(f"Processing col:{proc_col+1}") # Same shuffled indices across the sub-models items indices = np.arange(current_X_test_list[0].shape[1]) if importance_estimator != "Mod_RF": @@ -584,6 +625,7 @@ def joblib_compute_permutation( index_i=None, group_stacking=False, random_state=None, + verbose=0, ): """This function applies the permutation feature importance (PFI). @@ -620,15 +662,18 @@ def joblib_compute_permutation( Apply the stacking-based method for the provided groups. random_state: int, default=None Fixing the seeds of the random generator. + verbose: int, default=0 + If verbose > 0, the fitted iterations will be printed. 
""" rng = np.random.RandomState(random_state) - if index_i is not None: - print( - f"Iteration/Fold:{index_i}, Processing col:{proc_col+1}, Permutation:{perm+1}" - ) - else: - print(f"Processing col:{proc_col+1}, Permutation:{perm+1}") + if verbose > 0: + if index_i is not None: + print( + f"Iteration/Fold:{index_i}, Processing col:{proc_col+1}, Permutation:{perm+1}" + ) + else: + print(f"Processing col:{proc_col+1}, Permutation:{perm+1}") # A list of copied items to avoid any overlapping in the process current_X_test_list = [X_test_el.copy() for X_test_el in X_test_list] @@ -682,7 +727,6 @@ def joblib_compute_permutation( def hypertune_predictor(estimator, X, y, param_grid): - # param_grid = {"max_depth": [2, 5, 10]} grid_search = GridSearchCV(estimator, param_grid=param_grid, cv=2) grid_search.fit(X, y) return grid_search.best_estimator_ diff --git a/hidimstat/noise_std.py b/hidimstat/noise_std.py index dd1eb94..8158f6c 100644 --- a/hidimstat/noise_std.py +++ b/hidimstat/noise_std.py @@ -82,7 +82,7 @@ def reid(X, y, eps=1e-2, tol=1e-4, max_iter=10000, n_jobs=1, seed=0): def group_reid(X, Y, fit_Y=True, stationary=True, method='simple', order=1, - eps=1e-2, tol=1e-4, max_iter=1e4, n_jobs=1, seed=0): + eps=1e-2, tol=1e-4, max_iter=10000, n_jobs=1, seed=0): """Estimation of the covariance matrix using group Reid procedure diff --git a/hidimstat/utils.py b/hidimstat/utils.py index ab75b45..b4c00ae 100644 --- a/hidimstat/utils.py +++ b/hidimstat/utils.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Author: Binh Nguyen & Jerome-Alexis Chevalier +# Authors: Binh Nguyen & Jerome-Alexis Chevalier & Ahmad Chamma import copy import numpy as np import torch diff --git a/requirements.txt b/requirements.txt index 9815c1d..bc2c0cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ ---extra-index-url https://download.pytorch.org/whl/cu118 +--extra-index-url https://download.pytorch.org/whl/cpu numpy joblib scipy diff --git a/setup.py b/setup.py index 87507a1..8214613 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ PKG = 'hidimstat' DESCRIPTION = "High-dimensional statistical inference tools for Python" LONG_DESCRIPTION = open('README.md').read() -MAINTAINER = 'Chevalier (ja-che), Nguyen (tbng), Alexandre Blain (alexblnn), Ahmad Chamma and Bertrand Thirion (bthirion)' +MAINTAINER = 'Chevalier (ja-che), Nguyen (tbng), Alexandre Blain (alexblnn), Ahmad Chamma (achamma723) and Bertrand Thirion (bthirion)' MAINTAINER_EMAIL = 'bertrand.thirion@inria.fr' URL = 'https://github.com/Parietal-INRIA/hidimstat' DOWNLOAD_URL = 'https://github.com/Parietal-INRIA/hidimstat'