# Feature Selection

In [3]:
# !pip install sklearn

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("BankChurners.csv")

## Pre-processing

In [4]:
def make_target(df, column):
    target_dummies = pd.get_dummies(df['Attrition_Flag'])
    df = pd.concat([df, target_dummies], axis = 1)
    return df

def get_int_columns(df, dtype):
    features = []
    for col, t in zip(df.columns, list(df.dtypes)):
        if t == dtype:
            features.append(col)
    return features

df = make_target(df, column = 'Attrition_flag')

target = 'Attrited Customer'
features = get_int_columns(df, dtype='int64')

y = df[target]
X = df[features]

## Filter Methods (Univariate Feature Selection)

### Correlation/ANOVA

In [39]:
correlation_threshold = 0.10

def correlation_selection(df,
                          features, 
                          target,
                          threshold):
    
    correlations = df[features + [target]].corr()[target]
    selected_features = correlations[abs(correlations)>threshold]
    
    remove_target = selected_features.index[selected_features.index != target]
    return selected_features[remove_target]

selected = correlation_selection(df,
                                 features,
                                 target,
                                 threshold = 0.10)

print(selected)

Total_Relationship_Count   -0.150005
Months_Inactive_12_mon      0.152449
Contacts_Count_12_mon       0.204491
Total_Revolving_Bal        -0.263053
Total_Trans_Amt            -0.168598
Total_Trans_Ct             -0.371403
Name: Attrited Customer, dtype: float64


### Chi-Squares, ANOVA, F-Test, Mutual Info Gain

In [38]:
from sklearn.feature_selection import (
    SelectKBest, 
    chi2, 
    f_classif, 
    f_regression,
    r_regression,
    mutual_info_classif,
    mutual_info_regression
)

kb = SelectKBest(r_regression, k=4)
X_new = kb.fit_transform(X,y)
X_new = pd.DataFrame(X_new)
X_new.columns = kb.get_feature_names_out()

X_new

Unnamed: 0,Customer_Age,Dependent_count,Months_Inactive_12_mon,Contacts_Count_12_mon
0,45,3,1,3
1,49,5,1,2
2,51,3,1,0
3,40,4,4,1
4,40,3,1,0
...,...,...,...,...
10122,50,2,2,3
10123,41,2,2,3
10124,44,1,3,4
10125,30,2,3,3


## Wrapper Methods

### Forward Stepwise

In [35]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

kb = SequentialFeatureSelector(LogisticRegression(),
                               n_features_to_select=4,
                              direction = 'forward')
X_new = kb.fit_transform(X,y)
X_new = pd.DataFrame(X_new)
X_new.columns = kb.get_feature_names_out()

X_new

Unnamed: 0,Customer_Age,Dependent_count,Months_on_book,Contacts_Count_12_mon
0,45,3,39,3
1,49,5,44,2
2,51,3,36,0
3,40,4,34,1
4,40,3,21,0
...,...,...,...,...
10122,50,2,40,3
10123,41,2,25,3
10124,44,1,36,4
10125,30,2,36,3


### Backward Stepwise

In [36]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

kb = SequentialFeatureSelector(LogisticRegression(),
                               n_features_to_select=4,
                              direction = 'backward')
X_new = kb.fit_transform(X,y)
X_new = pd.DataFrame(X_new)
X_new.columns = kb.get_feature_names_out()

X_new

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,Total_Relationship_Count,Contacts_Count_12_mon,Total_Revolving_Bal,Total_Trans_Amt
0,5,3,777,1144
1,6,2,864,1291
2,4,0,0,1887
3,3,1,2517,1171
4,5,0,0,816
...,...,...,...,...
10122,3,3,1851,15476
10123,4,3,2186,8764
10124,5,4,0,10291
10125,4,3,0,8395


In [45]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

kb = RFE(LogisticRegression(), n_features_to_select=4)
X_new = kb.fit_transform(X,y)
X_new = pd.DataFrame(X_new)
X_new.columns = kb.get_feature_names_out()

X_new

Unnamed: 0,CLIENTNUM,Total_Revolving_Bal,Total_Trans_Amt,Total_Trans_Ct
0,768805383,777,1144,42
1,818770008,864,1291,33
2,713982108,0,1887,20
3,769911858,2517,1171,20
4,709106358,0,816,28
...,...,...,...,...
10122,772366833,1851,15476,117
10123,710638233,2186,8764,69
10124,716506083,0,10291,60
10125,717406983,0,8395,62


RFE and Sequential are very similar. The difference is Sequential is using the cross validation score to remove features. RFE is using the importance of the feature.

### Exhaustive Stepwise

[Link to Mlxtend Docs](http://rasbt.github.io/mlxtend/user_guide/feature_selection/ExhaustiveFeatureSelector/)


In [41]:
# # Install a pip package in the current Jupyter kernel
# import sys
# !{sys.executable} -m pip install mlxtend

In [42]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

lr = LogisticRegression()

efs1 = EFS(lr, 
           min_features=1,
           max_features=4,
           scoring='accuracy',
           print_progress=True,
           cv=5)

efs1 = efs1.fit(X, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Features: 385/385

Best accuracy score: 0.86
Best subset (indices): (4, 6, 7, 8)
Best subset (corresponding names): ('Total_Relationship_Count', 'Contacts_Count_12_mon', 'Total_Revolving_Bal', 'Total_Trans_Amt')


In [43]:
efs1.best_feature_names_

('Total_Relationship_Count',
 'Contacts_Count_12_mon',
 'Total_Revolving_Bal',
 'Total_Trans_Amt')

### Bi-Directional Elimination

Performs both forward and backward stepwise. First, it performs a step of forward stepwise, adding featuers that are significant. Then, it performs a backward elimination, removing any feature that is not insignificant. 

In [50]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

sbs = SFS(LogisticRegression(),
         k_features=4,
         forward=True,
         floating=True,
         cv=0)
sbs.fit(X, y)
sbs.k_feature_names_

('Total_Relationship_Count',
 'Months_Inactive_12_mon',
 'Contacts_Count_12_mon',
 'Total_Trans_Amt')

In [49]:
sbs.k_feature_names_

('Total_Relationship_Count',
 'Contacts_Count_12_mon',
 'Total_Revolving_Bal',
 'Total_Trans_Ct')

## Variance Threshold

In [57]:
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold()
selector.fit_transform(X)


array([[768805383,        45,         3, ...,       777,      1144,
               42],
       [818770008,        49,         5, ...,       864,      1291,
               33],
       [713982108,        51,         3, ...,         0,      1887,
               20],
       ...,
       [716506083,        44,         1, ...,         0,     10291,
               60],
       [717406983,        30,         2, ...,         0,      8395,
               62],
       [714337233,        43,         2, ...,      1961,     10294,
               61]])