In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [283]:
def MI_via_boruta(X, y, iter, model, seed=7, estimator=10, depth=None, boruta=True, mean_dec=False, feat_perm=False, learning_rate=0.01):
  """
  implementation of generalized Boruta idea for feature selection based on MI & Entropy via ensemble tree based models
  """

  """
  This function supports following models with: [following parameters that can be modified]:
  RandomForestClassifier          : [estimator, depth]
  RandomForestRegressor           : [estimator, depth]
  ExtraTreesClassifier            : [estimator, depth]
  AdaBoostClassifier              : [estimator]
  AdaBoostRegressor               : [estimator]
  GradientBoostingClassifier      : [estimator, depth, learning_rate]
  GradientBoostingRegressor       : [estimator, depth, learning_rate]
  HistGradientBoostingClassifier  : [estimator/max_iter, depth, learning_rate]
  HistGradientBoostingRegressor   : [estimator/max_iter, depth, learning_rate]

  #TODO: add configration for more parameters and more tree based models
  #TODO Later: Further optimization of tree based models via GridSearchCV for optimal parameters and model selection
  """

  """
  3 optionals are given here for selection of method for feature selection
  boruta : implemented below, Set to True by default
  mean_dec : based on mean decrease in impurity : via Scikit-Learn , False by default set to True for using this and set [boruta and feat_perm] = False
  feat_perm : based on feature permutation : via Scikit-Learn , False by default set to True for using this and set [boruta and mean_dec] = False
  """

  """
  X    : will take dataframe of features
  y    : will take label (both classification and regression "select model accordingly")
  iter : how many times for loop run for selected method of feature selection
  seed : random seed to start feature permutation
  ** estimator, depth, learning_rate can be used as default or selected as needed
  """

  """
  TODO: add checking of input data errors(type, size, shape)
  TODO Later: improve boruta feature selection strategy
  """

  """
  return a dataframe of features and there respective score/ranks depends on method used
  """


  """
  model selection
  """
  def select_model(model, estimator, depth, learning_rate):
    if model == 'RandomForestClassifier':
      from sklearn.ensemble import RandomForestClassifier
      return RandomForestClassifier(n_estimators=estimator, max_depth=depth)


    elif model == 'RandomForestRegressor':
      from sklearn.ensemble import RandomForestRegressor
      return RandomForestRegressor(n_estimators=estimator, max_depth=depth)


    elif model == 'ExtraTreesClassifier':
      from sklearn.ensemble import ExtraTreesClassifier
      return ExtraTreesClassifier(n_estimators=estimator, max_depth=depth)


    elif model == 'AdaBoostClassifier':
      from sklearn.ensemble import AdaBoostClassifier
      return AdaBoostClassifier(n_estimators=estimator)


    elif model == 'GradientBoostingClassifier':
      from sklearn.ensemble import GradientBoostingClassifier
      return GradientBoostingClassifier(n_estimators=estimator, max_depth=depth, learning_rate=learning_rate)


    elif model == 'GradientBoostingRegressor':
      from sklearn.ensemble import GradientBoostingRegressor
      return GradientBoostingRegressor(n_estimators=estimator, max_depth=depth, learning_rate=learning_rate)


    elif model == 'HistGradientBoostingClassifier': #preferred for big data samples as it works faster
      from sklearn.ensemble import HistGradientBoostingClassifier
      return HistGradientBoostingClassifier(max_iter=estimator, max_depth=depth, learning_rate=learning_rate)


    elif model == 'HistGradientBoostingRegressor':
      from sklearn.ensemble import HistGradientBoostingRegressor
      return HistGradientBoostingRegressor(max_iter=estimator, max_depth=depth, learning_rate=learning_rate)


    else:
      print('Please select model from sklearn.ensemble')
    

  """
  baruta basic idea implimentation, mean decrease in impurity & feature permutation
  
  """
  scores = np.zeros((len(X.columns)))
  if boruta == True:
    ## repeat based on no. of iteration specified
    for iter_ in range(iter):
      ## #creating new feature based on orignial
      np.random.seed(iter_)
      X_new = X.apply(np.random.permutation)
      X_new.columns = ['shadow_' + col for col in X.columns]
      X_boruta = pd.concat([X, X_shadow], axis = 1)
      X_boruta.replace(([np.inf, -np.inf], np.nan), inplace=True)
      X_boruta.fillna(0, inplace=True)
      ## fit a model as specified by user
      modl = select_model(model, estimator, depth, learning_rate)
      modl.fit(X_boruta, y)
      ### store feature importance for orignal and created features
      feat_imp_X = modl.feature_importances_[:len(X.columns)]
      feat_imp_new = modl.feature_importances_[len(X.columns):]
      ### compute scores for all trial and add to counter
      scores += (feat_imp_X > feat_imp_new.max())


  elif mean_dec == True:  #mean decrease in impurity # not a good fit for high cardinality features

    for iter_ in range(iter):

      np.random.seed(iter_)
      X_new = X.apply(np.random.permutation)
      X_new.columns = ['shadow_' + col for col in X.columns]
      X_mean_dec = pd.concat([X, X_shadow], axis = 1)
      X_mean_dec.replace(([np.inf, -np.inf], np.nan), inplace=True)
      X_mean_dec.fillna(0, inplace=True)
      
      ## fit a model as specified by user
      modl = select_model(model, estimator, depth, learning_rate)
      modl.fit(X_mean_dec, y)
      
      ### store feature importance for orignal and created features
      feat_imp_X = modl.feature_importances_[:len(X.columns)]
      feat_imp_new = modl.feature_importances_[len(X.columns):]
      
      ### compute scores for all trial and add to counter
      scores += (feat_imp_X > feat_imp_new.mean())


  elif feat_perm == True: #basic idea implimentation based on feature permutatio
    from sklearn.inspection import permutation_importance  #no bias toward high-cardinality features
    
    for iter_ in range(iter):
      
      np.random.seed(iter_)
      X_new = X.apply(np.random.permutation)
      X_new.columns = ['shadow_' + col for col in X.columns]
      X_feat_perm = pd.concat([X, X_shadow], axis = 1)
      X_feat_perm.replace(([np.inf, -np.inf], np.nan), inplace=True)
      X_feat_perm.fillna(0, inplace=True)
      
      ## fit a model as specified by user
      modl = select_model(model, estimator, depth, learning_rate)
      modl.fit(X_feat_perm, y)
      result = permutation_importance(modl, X_feat_perm, y)
      
      
      ### store feature importance for orignal and created features
      feat_imp_X = result.importances_mean[:len(X.columns)]
      feat_imp_new = result.importances_mean[len(X.columns):]
      
      ### compute scores for all trial and add to counter
      scores += (feat_imp_X > feat_imp_new.max())
  
  col = [col for col in X.columns]
  df_score = pd.DataFrame(list(zip(col, scores)), columns=['Feautre', 'score'])
  return df_score


In [None]:
!pip install pycaret #getting opensource datasets

In [13]:
from pycaret.datasets import get_data
df_classif = get_data('cancer') 
df_reg = get_data('insurance') 

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,0,5,1,1,1,2,1,3,1,1
1,0,5,4,4,5,7,10,3,2,1
2,0,3,1,1,1,2,2,3,1,1
3,0,6,8,8,1,3,4,3,7,1
4,0,4,1,1,3,2,1,3,1,1


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [14]:
df_reg1 = get_data('forest') 

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


#### converting categorical feature to numerical

In [22]:
df_reg['sex'] = pd.factorize(df_reg['sex'], sort=True)[0] + 1 #female = 1 , male = 2

In [23]:
df_reg['region'] = pd.factorize(df_reg['region'], sort=True)[0] + 1 #northeast = 1 , northwest = 2, southeast = 3, southwest = 4
df_reg['smoker'] = pd.factorize(df_reg['smoker'], sort=True)[0] + 1    #no = 0. yes=1
df_reg1['month'] = pd.factorize(df_reg1['month'], sort=True)[0] + 1 #by alphabetic order
df_reg1['day'] = pd.factorize(df_reg1['day'], sort=True)[0] + 1   #by alphabetic order

In [218]:
df_classif = df_classif.astype(np.float32)
df_reg = df_reg.astype(np.float32)
df_reg1 = df_reg1.astype(np.float32)

In [219]:
pd.set_option('precision', 2)

In [19]:
y_clf = df_classif.iloc[:, 0:1]
X_clf = df_classif.iloc[:, 1:]
y_clf.head()

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0


In [24]:
X_reg = df_reg.iloc[:, 0:6]
y_reg = df_reg.iloc[:, 6:]
X_reg.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,1,27.9,0,2,4
1,18,2,33.77,1,1,3
2,28,2,33.0,3,1,3
3,33,2,22.705,0,1,2
4,32,2,28.88,0,1,2


In [222]:
X_reg1 = df_reg1.iloc[:, 0:12]
y_reg1 = df_reg1.iloc[:, 12:]
X_reg1.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain
0,7.0,5.0,8.0,1.0,86.2,26.2,94.3,5.1,8.2,51.0,6.7,0.0
1,7.0,4.0,11.0,6.0,90.6,35.4,669.1,6.7,18.0,33.0,0.9,0.0
2,7.0,4.0,11.0,3.0,90.6,43.7,686.9,6.7,14.6,33.0,1.3,0.0
3,8.0,6.0,8.0,1.0,91.7,33.3,77.5,9.0,8.3,97.0,4.0,0.2
4,8.0,6.0,8.0,4.0,89.3,51.3,102.2,9.6,11.4,99.0,1.8,0.0


In [245]:
X_reg.dtypes

age         float32
sex         float32
bmi         float32
children    float32
smoker      float32
region      float32
dtype: object

In [247]:
np.where(xt.values >= np.finfo(np.float32).max)

(array([], dtype=int64), array([], dtype=int64))

In [243]:
np.isnan(X_reg.values.any())

False

In [286]:
import warnings 
warnings.filterwarnings('ignore')

In [287]:
reg_bruta = MI_via_boruta(X_reg, y_reg, iter=20, model='RandomForestRegressor', estimator=10)
#reg_meandec = MI_via_boruta(xt, yt, iter=20, model='RandomForestRegressor', estimator=10, boruta=False, mean_dec=True)

In [324]:
reg_bruta

Unnamed: 0,Feautre,score
0,age,20.0
1,sex,20.0
2,bmi,20.0
3,children,20.0
4,smoker,20.0
5,region,20.0


In [289]:
reg_bruta1 = MI_via_boruta(X_reg1, y_reg1, iter=20, model='RandomForestRegressor', estimator=10)

In [290]:
reg_bruta1

Unnamed: 0,Feautre,score
0,X,20.0
1,Y,20.0
2,month,20.0
3,day,20.0
4,FFMC,20.0
5,DMC,20.0
6,DC,20.0
7,ISI,20.0
8,temp,20.0
9,RH,20.0


In [291]:
clf_bruta = MI_via_boruta(X_clf, y_clf, iter=20, model='RandomForestClassifier', estimator=10)

In [292]:
clf_bruta

Unnamed: 0,Feautre,score
0,age,20.0
1,menopause,20.0
2,tumor-size,20.0
3,inv-nodes,20.0
4,node-caps,20.0
5,deg-malig,20.0
6,breast,20.0
7,breast-quad,20.0
8,irradiat,16.0


In [294]:
reg_meandec = MI_via_boruta(X_reg, y_reg, iter=20, model='RandomForestRegressor', estimator=10, boruta=False, mean_dec=True)
reg_meandec1 = MI_via_boruta(X_reg1, y_reg1, iter=20, model='RandomForestRegressor', estimator=10, boruta=False, mean_dec=True)
clf_meandec = MI_via_boruta(X_clf, y_clf, iter=20, model='RandomForestClassifier', estimator=10, boruta=False, mean_dec=True)

In [326]:
reg_meandec1

Unnamed: 0,Feautre,score
0,X,20.0
1,Y,20.0
2,month,20.0
3,day,20.0
4,FFMC,20.0
5,DMC,20.0
6,DC,20.0
7,ISI,20.0
8,temp,20.0
9,RH,20.0


In [325]:
clf_meandec

Unnamed: 0,Feautre,score
0,age,20.0
1,menopause,20.0
2,tumor-size,20.0
3,inv-nodes,20.0
4,node-caps,20.0
5,deg-malig,20.0
6,breast,20.0
7,breast-quad,20.0
8,irradiat,19.0


In [297]:
reg_meandec

Unnamed: 0,Feautre,score
0,age,20.0
1,sex,20.0
2,bmi,20.0
3,children,20.0
4,smoker,20.0
5,region,20.0


In [298]:
reg_featperm = MI_via_boruta(X_reg, y_reg, iter=20, model='RandomForestRegressor', estimator=10, boruta=False, feat_perm=True)
reg_featperm1 = MI_via_boruta(X_reg1, y_reg1, iter=20, model='RandomForestRegressor', estimator=10, boruta=False, feat_perm=True)
clf_featperm = MI_via_boruta(X_clf, y_clf, iter=20, model='RandomForestClassifier', estimator=10, boruta=False, feat_perm=True)

In [328]:
reg_featperm

Unnamed: 0,Feautre,score
0,age,20.0
1,sex,20.0
2,bmi,20.0
3,children,20.0
4,smoker,20.0
5,region,20.0


In [327]:
clf_featperm

Unnamed: 0,Feautre,score
0,age,20.0
1,menopause,20.0
2,tumor-size,20.0
3,inv-nodes,20.0
4,node-caps,20.0
5,deg-malig,20.0
6,breast,20.0
7,breast-quad,20.0
8,irradiat,17.0


In [301]:
reg_featperm1

Unnamed: 0,Feautre,score
0,X,20.0
1,Y,20.0
2,month,20.0
3,day,20.0
4,FFMC,20.0
5,DMC,20.0
6,DC,20.0
7,ISI,20.0
8,temp,20.0
9,RH,20.0


comparing with start of the art techniques

In [151]:
!pip install Boruta



In [305]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

rf = RandomForestClassifier(n_jobs=-1, max_depth=5)
rf1 = RandomForestRegressor(n_jobs=-1, max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)
feat_selector1 = BorutaPy(rf1, n_estimators='auto', random_state=1)

In [307]:
X_clf1 = X_clf.values
y_clf1 = y_clf.values

In [310]:
# find all relevant features - 5 features should be selected
feat_selector.fit(X_clf1, y_clf1)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	9
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	9
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	9
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	9
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	9
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	9
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	9
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	8
Tentative: 	1
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	8
Tentative: 	1
Rejected: 	0
Iteration: 	10 / 100
Confirmed: 	8
Tentative: 	1
Rejected: 	0
Iteration: 	11 / 100
Confirmed: 	8
Tentative: 	1
Rejected: 	0
Iteration: 	12 / 100
Confirmed: 	9
Tentative: 	0
Rejected: 	0


BorutaPy finished running.

Iteration: 	13 / 100
Confirmed: 	9
Tentative: 	0
Rejected: 	0


BorutaPy(estimator=RandomForestClassifier(max_depth=5, n_estimators=84,
                                          n_jobs=-1,
                                          random_state=RandomState(MT19937) at 0x7EFDBE4B8270),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x7EFDBE4B8270, verbose=2)

In [309]:
feat_selector.ranking_

array([1, 1, 1, 1, 1, 1, 1, 1, 1])

In [311]:
X_regx = X_reg.values
y_regx = y_reg.values

X_regx1 = X_reg1.values
y_regx1 = y_reg1.values

In [312]:
feat_selector1.fit(X_regx, y_regx)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	6
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	6
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	6
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	6
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	6
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	6
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	6
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	3
Tentative: 	1
Rejected: 	2
Iteration: 	9 / 100
Confirmed: 	3
Tentative: 	1
Rejected: 	2
Iteration: 	10 / 100
Confirmed: 	3
Tentative: 	1
Rejected: 	2
Iteration: 	11 / 100
Confirmed: 	3
Tentative: 	1
Rejected: 	2
Iteration: 	12 / 100
Confirmed: 	3
Tentative: 	1
Rejected: 	2
Iteration: 	13 / 100
Confirmed: 	3
Tentative: 	1
Rejected: 	2
Iteration: 	14 / 100
Confirmed: 	3
Tentative: 	1
Rejected: 	2
Iteration: 	15 / 100
Confirmed: 	3
Tentative: 	1
Rejected: 	2
Iteration: 	16 / 100
Confirmed: 	3
Tentative: 	1
Rejected: 	2
Iteration: 	17 / 

BorutaPy(estimator=RandomForestRegressor(max_depth=5, n_estimators=56,
                                         n_jobs=-1,
                                         random_state=RandomState(MT19937) at 0x7EFDBE4B8050),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x7EFDBE4B8050, verbose=2)

In [314]:
feat_selector1.ranking_

array([1, 4, 1, 2, 1, 3])

In [319]:
feat_selector1.fit(X_regx1, y_regx1)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	12
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	12
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	12
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	12
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	12
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	12
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	12
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	1
Rejected: 	11
Iteration: 	9 / 100
Confirmed: 	0
Tentative: 	1
Rejected: 	11
Iteration: 	10 / 100
Confirmed: 	0
Tentative: 	1
Rejected: 	11
Iteration: 	11 / 100
Confirmed: 	0
Tentative: 	1
Rejected: 	11
Iteration: 	12 / 100
Confirmed: 	0
Tentative: 	1
Rejected: 	11
Iteration: 	13 / 100
Confirmed: 	0
Tentative: 	1
Rejected: 	11
Iteration: 	14 / 100
Confirmed: 	0
Tentative: 	1
Rejected: 	11
Iteration: 	15 / 100
Confirmed: 	0
Tentative: 	1
Rejected: 	11
Iteration: 	16 / 100
Confirmed: 	0
Tentative: 	1
Rejected: 	11
I

BorutaPy(estimator=RandomForestRegressor(max_depth=5, n_estimators=28,
                                         n_jobs=-1,
                                         random_state=RandomState(MT19937) at 0x7EFDBE4B8050),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x7EFDBE4B8050, verbose=2)

In [320]:
feat_selector2.ranking_

array([ 6,  5, 12, 11, 10,  3,  9,  8,  2,  4,  6, 13])

#TODO_Future_Work: Mutual Information for Feature Selection via kernel density estimation of Gaussian Mixture Model

In [178]:
from sklearn import mixture
from sklearn.model_selection import train_test_split
from scipy.stats import multivariate_normal

In [179]:
def check_array(X, name="X", dim=2): 
    if not (type(X)==np.ndarray) or len(X.shape)!=dim:
            raise ValueError(name+" should be a {:}-dimensional Numpy array.".format(dim))

In [180]:
def gmm_model(X_train, X_val, k, covariance_type='full', reg_covar=1e-06, random_state=42):
    
    clf = mixture.GaussianMixture(n_components=k, covariance_type=covariance_type, reg_covar=reg_covar, random_state=random_state)
    clf.fit(X_train)
    return clf.score(X_val)

In [None]:
def check_gmm(X, y, y_cat=False, num_comps=[2,3,4,5], val_size=0.33, reg_covar=1e-06, random_state=42):

    covariance_type='full'
    
    #Checking input format
    check_array(X, name="X", dim=2)
    check_array(y, name="y", dim=1)
    
    #for classification
    if y_cat: 
        classes=list(set(y))
        gmm={}

"""
Pending
"""
        
    return gmm #it is a dictionary of GMMs
    
    #for regression
    else: 
        #Selecting number of components
"""
Pending
"""
        
        return gmm 

In [182]:
def MI_gmm_reg(X, y, gmm, feat, eps=10**-50):
    
'''
Pending
'''
    
    return {'mi':m}

In [183]:
x1 = np.array(X)
x2 = np.array(X_1)
y1 = np.array(y)

In [184]:
gmm_model= check_gmm(x2, y1)

In [185]:
gmm_model

GaussianMixture(n_components=2, random_state=42)

In [186]:
f = [ 0, 1, 2, 3, 4] #index of features to check change in MI
MI_gmm_reg(x2, y1, gmm, feat=f)

{'mi': 9.43366862090208, 'std': 0.4285194380902491}