## Variable selection

### Forward Feature Selection

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn import linear_model
import warnings
warnings.filterwarnings("ignore")
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.feature_selection import RFE

In [3]:
# Load the cleaned Superstore dataset (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer="../data/superstore_clean.csv")

In [4]:
# Work on an independent copy so the loaded frame `data` is left untouched
seleccion_df = data.copy(deep=True)

In [5]:
# Split into target (the `Response` column) and predictors (every other column)
y = seleccion_df['Response']
X = seleccion_df.drop(columns='Response')

In [6]:
# Forward sequential feature selection: the subset grows one feature at a time
# (no floating step) until 7 features are kept; candidates are scored with
# ROC AUC under 5-fold cross-validation.
sfs = SequentialFeatureSelector(
    RandomForestClassifier(n_jobs=-1, random_state=0, n_estimators=100),
    cv=5,
    scoring='roc_auc',
    verbose=2,
    floating=False,
    forward=True,
    k_features=7,
)

In [7]:
# Run the selection on the full predictor matrix and keep the value returned by fit
selected_features = sfs.fit(X=X, y=y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  28 out of  28 | elapsed:   10.8s finished

[2023-03-02 19:59:28] Features: 1/7 -- score: 0.6625802132513912[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    9.5s finished

[2023-03-02 19:59:38] Features: 2/7 -- score: 0.6998808669858209[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed:    9.5s finished

[2023-03-02 19:59:47] Features: 3/7 -- score: 0.7252348540610815[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

In [8]:
# Names of the features kept by the forward selection (k_features = 7)
selected_features.k_feature_names_

('Marital_Status_Married',
 'Teenhome',
 'MntWines',
 'MntMeatProducts',
 'NumCatalogPurchases',
 'NumStorePurchases',
 'NumWebVisitsMonth')

In [9]:
# One row per subset size: per-fold CV scores, mean score, chosen features and spread statistics
pd.DataFrame.from_dict(sfs.get_metric_dict()).transpose()

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(19,)","[0.6572844439221217, 0.6475255302435192, 0.694...",0.66258,"(NumCatalogPurchases,)",0.038033,0.029591,0.014795
2,"(9, 19)","[0.6693892741019312, 0.73442655145326, 0.72967...",0.699881,"(Teenhome, NumCatalogPurchases)",0.045105,0.035094,0.017547
3,"(9, 19, 21)","[0.7320092451130176, 0.762274155538099, 0.7313...",0.725235,"(Teenhome, NumCatalogPurchases, NumWebVisitsMo...",0.043933,0.034182,0.017091
4,"(6, 9, 19, 21)","[0.7603713714890116, 0.7427533385703063, 0.717...",0.733547,"(Marital_Status_Married, Teenhome, NumCatalogP...",0.025172,0.019584,0.009792
5,"(6, 9, 11, 19, 21)","[0.770615426802993, 0.7379222309505107, 0.7515...",0.747409,"(Marital_Status_Married, Teenhome, MntWines, N...",0.023683,0.018426,0.009213
6,"(6, 9, 11, 13, 19, 21)","[0.799858972852274, 0.7360369206598586, 0.8032...",0.771967,"(Marital_Status_Married, Teenhome, MntWines, M...",0.03538,0.027527,0.013763
7,"(6, 9, 11, 13, 19, 20, 21)","[0.8172719081756571, 0.7732325216025138, 0.796...",0.786302,"(Marital_Status_Married, Teenhome, MntWines, M...",0.025858,0.020118,0.010059


### Exhaustive feature selection

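Exhaustive search fits a model for every feature subset in the requested size range, which is very expensive in CPU time. The code below is therefore left commented out and can be skipped.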

In [10]:
# %%time
# Disabled on purpose (kept for reference): exhaustive search over every
# 4- and 5-feature subset of X.
# NOTE(review): this uses scoring='recall' and cv=None, unlike the forward
# selection in this notebook (scoring='roc_auc', cv=5) — confirm that is
# intended before re-enabling.
# efs = EFS(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
#          min_features= 4,
#           max_features= 5,
#           scoring='recall',
#           cv = None,
#           n_jobs=-1
#          ).fit(X, y)

In [11]:
# efs.best_feature_names_

### Recursive Feature Elimination (RFE)

In [12]:
# Initialise the transformer: recursive feature elimination that keeps 10 features.
# The forest is seeded and configured like the one used for forward selection,
# so the resulting ranking is reproducible across kernel restarts (an unseeded
# RandomForestClassifier can rank features differently on every run).
rfe = RFE(
    estimator=RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
    n_features_to_select=10,
)

# Fit
rfe.fit(X, y)

In [13]:
from operator import itemgetter

# Print every column of X with its RFE rank (1 = selected), lowest rank first.
# The sort key is the rank only, so tied columns keep their original column order.
# The loop variables are deliberately NOT named `x`/`y`: binding `y` here would
# overwrite the target Series with a column-name string and break any later
# (or re-run) cell that calls `.fit(X, y)`.
features = X.columns.to_list()
for rank, feature in sorted(zip(rfe.ranking_, features), key=itemgetter(0)):
    print(rank, feature)


1 Income
1 Recency
1 MntWines
1 MntMeatProducts
1 MntFishProducts
1 MntSweetProducts
1 MntGoldProds
1 NumStorePurchases
1 NumWebVisitsMonth
1 Edad
2 NumCatalogPurchases
3 MntFruits
4 Dt_Customer_m
5 NumWebPurchases
6 NumDealsPurchases
7 tiempo_participacion
8 Marital_Status_Married
9 Dt_Customer_year
10 Teenhome
11 Dt_Customer_q
12 Marital_Status_Single
13 Education_PhD
14 Education_Graduation
15 Marital_Status_Divorced
16 Kidhome
17 Education_Master
18 Education_Basic
19 Complain
