In [61]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from boruta import BorutaPy
from numpy import ndarray
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split


In [2]:
def generate_data(n,p,k,scheme="one"):
    """Simulate a binary classification data set with ``k`` informative features.

    An ``(n, p)`` matrix of independent standard-normal values is drawn from
    NumPy's global random state, and every row is labelled using only its
    first ``k`` columns; the remaining columns are pure noise.

    Parameters
    ----------
    n : int
        Number of samples (rows).
    p : int
        Total number of features (columns).
    k : int
        Number of leading columns that determine the label.
    scheme : str, default "one"
        ``"one"``: label is 1 when the sum of squares of the first ``k``
        values exceeds the median of a chi-square distribution with ``k``
        degrees of freedom.
        Any other value: label is 1 when the sum of absolute values of the
        first ``k`` values exceeds ``k``.

    Returns
    -------
    X : numpy.ndarray of shape (n, p)
        The simulated feature matrix.
    y : numpy.ndarray of shape (n,)
        Float labels, each either 0.0 or 1.0.
    """
    def draw_features():
        # One vectorised draw instead of filling the matrix row by row.
        return np.random.normal(0,1,(n,p))

    def scheme_one():
        X=draw_features()
        # The threshold depends only on k, so compute it once.
        th=stats.chi2.ppf(0.5,df=k)
        y=(np.sum(X[:,:k]**2,axis=1)>th).astype(float)
        return X,y

    def scheme_two():
        X=draw_features()
        # The labels were previously stored in ``np.array(n)``, a 0-d array
        # that cannot be indexed; building them as a length-n vector fixes
        # the IndexError this scheme used to raise.
        y=(np.sum(np.abs(X[:,:k]),axis=1)>k).astype(float)
        return X,y

    if scheme=="one":
        return scheme_one()
    else:
        return scheme_two()
        

In [4]:
# Candidate sample sizes for the simulated data sets.
N = [500, 800, 1000, 2000]

# Candidate (p, k) pairs passed to generate_data:
# p = total number of features, k = number of label-determining features.
PK = [
    (50, 10),
    (25, 5),
    (40, 8),
    (100, 25),
]

In [9]:
# Pick one sample size and one (p, k) pair at random, then simulate a
# scheme-"one" data set from them.
i_n = np.random.randint(len(N))
i_pk = np.random.randint(len(PK))
X, y = generate_data(N[i_n], *PK[i_pk], "one")

# Fit a Boruta selector and a plain random forest on the same data.
# Boruta is fitted first, as before, so the order of fits is unchanged.
boruta = BorutaPy(estimator=RandomForestClassifier()).fit(X, y)
rf = RandomForestClassifier().fit(X, y)

In [62]:
# NOTE(review): train_test_split(X, y) returns all splits together, so X_
# holds the whole list (per scikit-learn docs: X_train, X_test, y_train,
# y_test), not just a feature matrix. No random_state is passed, so the
# split differs between runs.
X_=train_test_split(X,y)

[array([[ 1.23492188,  1.55484227, -1.2226281 , ...,  2.73859825,
         -0.46616456,  0.84199162],
        [ 0.86068505,  1.40610662,  1.05143207, ..., -0.58626787,
          0.65071531,  0.61148577],
        [ 0.0296373 ,  1.35488512, -1.03723087, ...,  0.47354886,
         -0.31495012,  0.79259074],
        ...,
        [ 1.01891171, -1.29573572,  1.6816942 , ...,  1.9727875 ,
         -0.15468154,  0.35212877],
        [ 1.93930235,  0.3548332 , -2.1661768 , ..., -0.43787975,
          0.86826442, -1.63119431],
        [-0.85700155, -1.71995066,  0.98551087, ...,  0.09460938,
         -0.17100561, -0.83431902]], shape=(375, 20)),
 array([[ 0.50156198, -0.60965304,  0.66190703, ...,  0.62934453,
         -0.52616639, -0.33223734],
        [-0.48336242,  0.39937875, -0.3850722 , ..., -0.6086688 ,
         -0.66122897,  0.71553487],
        [ 1.9970928 , -1.85002568,  0.82097947, ..., -0.0855701 ,
          0.59556957,  0.38403848],
        ...,
        [ 1.09600236, -1.70627176,  0

In [18]:
# Unfitted Boruta selector wrapping a default random forest; it is fitted
# in the following cell.
model=BorutaPy(estimator=RandomForestClassifier())

In [21]:
# Fit the Boruta selector on the simulated data; this populates
# model.ranking_, which a later cell sorts with np.argsort.
model.fit(X,y)

In [70]:
class VarImportanceRanker:
    """Score the features of a fixed data set with one of three methods.

    The instance stores the data it was given and always fits its estimators
    on that stored data, never on notebook-level globals.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix passed to the estimators' ``fit``.
    y : numpy.ndarray
        Target vector passed to the estimators' ``fit``.
    rf_args : dict, optional
        Keyword arguments forwarded to every ``RandomForestClassifier`` this
        class creates. ``None`` (the default) means no extra arguments.

    Notes
    -----
    The three methods do not share a scale: ``"impurity"`` and
    ``"feat_perm"`` return importance scores, while the Boruta path returns
    ``ranking_`` (NOTE(review): per the BorutaPy docs, lower rank means more
    relevant -- confirm before comparing it with the other two).
    """

    def __init__(self, X: np.ndarray, y: np.ndarray, rf_args: "dict | None" = None):
        # Copy into a fresh dict so no mutable default is shared between
        # instances and the caller's dict is never aliased.
        rf_args = dict(rf_args) if rf_args else {}
        self.X: np.ndarray = X
        self.y: np.ndarray = y
        self.rf: RandomForestClassifier = RandomForestClassifier(**rf_args)
        # Boruta gets its own forest, separate from ``self.rf``.
        self.boruta: BorutaPy = BorutaPy(RandomForestClassifier(**rf_args))

    def _impurity_rf(self):
        """Fit the forest on the stored data and return ``feature_importances_``."""
        # Use the instance's data; this previously read the globals X and y.
        self.rf.fit(self.X, self.y)
        return self.rf.feature_importances_

    def _boruta(self):
        """Fit Boruta on the stored data and return its ``ranking_`` array."""
        # Use the instance's data; this previously read the globals X and y.
        self.boruta.fit(self.X, self.y)
        return self.boruta.ranking_

    def _feat_perm(self):
        """Fit the forest and return mean permutation importances.

        The importances are computed on the same data the forest was fitted
        on, not on a held-out split.
        """
        self.rf.fit(self.X, self.y)
        res = permutation_importance(self.rf, self.X, self.y)
        return res.importances_mean

    def rank(self, name="boruta"):
        """Return one value per feature using the method selected by ``name``.

        Parameters
        ----------
        name : str, default "boruta"
            ``"impurity"`` selects the forest's impurity-based importances,
            ``"feat_perm"`` selects mean permutation importances. Any other
            value, including the default, selects the Boruta ranking.

        Returns
        -------
        numpy.ndarray
            The array produced by the selected method.
        """
        if name == "impurity":
            return self._impurity_rf()
        if name == "feat_perm":
            return self._feat_perm()
        # Unrecognised names keep falling back to Boruta, as before.
        return self._boruta()
    
    
        

In [72]:
# Mean permutation importance of every feature for the current X, y,
# computed through the ranker class defined above.
VarImportanceRanker(X,y).rank("feat_perm")

array([0.0672, 0.1072, 0.106 , 0.0544, 0.06  , 0.    , 0.    , 0.    ,
       0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
       0.    , 0.    , 0.    , 0.    ])

In [50]:
# Feature indices ordered from the lowest Boruta ranking_ value to the
# highest. NOTE(review): features with equal rank appear in no guaranteed
# order under argsort's default sort kind.
np.argsort(model.ranking_)

array([ 0,  1,  2,  3,  4, 18, 15,  6, 12,  7, 19, 16, 10,  5, 17, 14,  9,
        8, 11, 13])

In [67]:
# Sanity check: (number of samples, number of features) of the simulated data.
X.shape

(500, 20)

In [59]:
# Impurity-based importances of the random forest fitted in the
# data-generation cell, one value per feature.
rf.feature_importances_

array([0.10296484, 0.14744914, 0.14000227, 0.0991818 , 0.10532835,
       0.02512043, 0.02889843, 0.02432775, 0.0327687 , 0.02958913,
       0.02668252, 0.02762182, 0.03191953, 0.02333742, 0.02562006,
       0.02403458, 0.02468811, 0.02714689, 0.02721055, 0.02610766])