# Feature Selection

In [None]:
import pandas as pd
import numpy as np
from IPython.display import display

# Render every column when a frame is displayed in the notebook.
pd.set_option('display.max_columns', None)

# Load the preprocessed dataset and preview its first rows.
data = pd.read_csv('data_processed/data.csv')
display(data.head())

In [None]:
def get_features(df):
    """Return a copy of ``df`` without the 'status' target column."""
    return df.drop(columns='status')
def get_target(df):
    """Return a DataFrame holding only the 'status' column of ``df``.

    If ``df`` has no 'status' column the result has no columns at all.
    """
    is_target = df.columns.isin(['status'])
    return df.loc[:, is_target]

### Remove unnecessary features

In [None]:
# Keep every column except identifiers: names ending in '_id' and the 'code' column.
no_ids = [c for c in data.columns if not c.endswith('_id') and c != 'code']
# Take an explicit copy so later in-place column assignments operate on an
# independent frame rather than on a selection of the original one.
data = data[no_ids].copy()
# data.drop(['account_frequency', 'gender', 'card_type'], axis=1, inplace=True)

display(data.head())
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() would additionally render a stray "None".
data.info()

### Encode

In [None]:
def encode_data(df, columns):
    """Ordinal-encode the requested columns of ``df`` in place.

    Args:
        df: DataFrame to encode. It is modified in place.
        columns: Candidate column names; names not present in ``df`` are
            skipped.

    Returns:
        The same ``df`` object, with each selected column replaced by its
        ordinal codes. Missing values are encoded as -1.
    """
    cols = [col for col in columns if col in df.columns]
    if not cols:
        # Nothing to encode: return early rather than fitting an encoder
        # on an empty column selection.
        return df

    from sklearn.preprocessing import OrdinalEncoder

    encoder = OrdinalEncoder(encoded_missing_value=-1)
    df[cols] = encoder.fit_transform(df[cols])
    return df

In [None]:
# Every object-dtype column is treated as categorical and ordinal-encoded.
categorical_columns = data.select_dtypes(include="object").columns.tolist()
data = encode_data(data, categorical_columns)

data

## Feature Selection
### Wrapper Method
Split the features into subsets and train a model on each subset. Based on the model's score, add or remove features and train the model again.
#### 1. Forward Selection

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt


# Feature matrix (all columns except the target) and the target vector.
originalFeatures = x = get_features(data)
y = data['status']

def forward_selection(featuresData, target, k_features = 20):
    """Run sequential forward selection and plot the score per subset size.

    A RandomForestClassifier is scored with 5-fold cross-validated ROC AUC.

    Args:
        featuresData: Feature matrix to select from.
        target: Target values aligned with ``featuresData``.
        k_features: Number of features to select, passed to the selector.

    Returns:
        DataFrame built from the selector's metric dict, one row per
        evaluated subset.
    """
    sfs = SFS(RandomForestClassifier(),
              k_features=k_features,
              forward=True,
              floating=False,
              cv = 5,
              scoring = 'roc_auc',
              n_jobs = -1)
    # Fit on the arguments, not on the notebook-level globals x and y.
    sfs = sfs.fit(featuresData, target)
    plot_sfs(sfs.get_metric_dict(), kind='std_dev')
    plt.title('Sequential Forward Selection')
    plt.grid()
    plt.show()
    return pd.DataFrame.from_dict(sfs.get_metric_dict()).T

forward_selection_result = forward_selection(x, y)
forward_selection_result

#### 2. Backward Elimination

In [None]:
def backward_selection(featuresData, target, k_features = 1):
    """Run sequential backward elimination and plot the score per subset size.

    A RandomForestClassifier is scored with 5-fold cross-validated ROC AUC.

    Args:
        featuresData: Feature matrix to eliminate features from.
        target: Target values aligned with ``featuresData``.
        k_features: Number of features to stop at, passed to the selector.

    Returns:
        DataFrame built from the selector's metric dict, one row per
        evaluated subset.
    """
    sbs = SFS(RandomForestClassifier(),
              # Honour the argument; its default matches the former hard-coded 1.
              k_features=k_features,
              forward=False,
              floating=False,
              scoring = 'roc_auc',
              cv = 5,
              n_jobs = -1)
    # Fit on the arguments, not on the notebook-level globals x and y.
    sbs = sbs.fit(featuresData, target)
    plot_sfs(sbs.get_metric_dict(), kind='std_dev')
    plt.title('Backward Elimination')
    plt.grid()
    plt.show()
    return pd.DataFrame.from_dict(sbs.get_metric_dict()).T

backward_selection_result = backward_selection(x, y)
backward_selection_result

#### 3. Bi-directional Elimination (Step-wise Selection)

In [None]:
def bi_directional_elimination(featuresData, target, k_features = (1, 20)):
    """Run floating forward (step-wise) selection and plot the scores.

    A RandomForestClassifier is scored with 5-fold cross-validated ROC AUC.

    Args:
        featuresData: Feature matrix to select from.
        target: Target values aligned with ``featuresData``.
        k_features: Passed to the selector. The default ``(1, 20)`` is the
            range that was previously hard-coded, so calls that omit this
            argument behave as before.

    Returns:
        DataFrame built from the selector's metric dict, one row per
        evaluated subset.
    """
    sffs = SFS(RandomForestClassifier(),
              k_features=k_features,
              forward=True,
              floating=True,
              scoring = 'roc_auc',
              cv = 5,
              n_jobs = -1)
    # Fit on the arguments, not on the notebook-level globals x and y.
    sffs = sffs.fit(featuresData, target)
    plot_sfs(sffs.get_metric_dict(), kind='std_dev')
    plt.title('Bi-directional Elimination')
    plt.grid()
    plt.show()
    return pd.DataFrame.from_dict(sffs.get_metric_dict()).T

bi_directional_elimination_result = bi_directional_elimination(x, y)
bi_directional_elimination_result

### Overview of the methods
#### Comparison

In [None]:
# Cast 'avg_score' to float so the best score can be located numerically.
forward_selection_result['avg_score'] = forward_selection_result['avg_score'].astype(float)
backward_selection_result['avg_score'] = backward_selection_result['avg_score'].astype(float)
bi_directional_elimination_result['avg_score'] = bi_directional_elimination_result['avg_score'].astype(float)

# Find the best result for each method.
# idxmax() returns an index *label*, so the row must be fetched with .loc.
# The result frames are indexed by the keys of the selector's metric dict
# (per the mlxtend docs, the subset sizes), not by 0-based positions, so
# .iloc would select the wrong row or raise IndexError.
a = forward_selection_result.loc[forward_selection_result['avg_score'].idxmax()]
b = backward_selection_result.loc[backward_selection_result['avg_score'].idxmax()]
c = bi_directional_elimination_result.loc[bi_directional_elimination_result['avg_score'].idxmax()]

# One summary row per method: chosen features, their count and the score.
c_df = pd.DataFrame({
    'method': ['forward selection', 'backward elimination', 'bi-directional'],
    'feature_names': [a['feature_names'], b['feature_names'], c['feature_names']],
    'n_features': [len(a['feature_names']), len(b['feature_names']), len(c['feature_names'])],
    'avg_score': [a['avg_score'], b['avg_score'], c['avg_score']]
})

display(c_df)

#### Selection

In [None]:
# Feature subset of the method with the highest average score.
best_features_names = c_df.loc[c_df['avg_score'].idxmax()]['feature_names']

# Select the columns directly. This keeps each column's dtype, whereas
# stacking the Series as rows and transposing can upcast mixed dtypes.
best_feature = data[list(best_features_names)]
best_feature

## Save

In [None]:
# Re-attach the target column to the selected features, then write the
# combined frame to disk without the index.
target_column = data['status']
best_feature = pd.concat([best_feature, target_column], axis=1)
output_path = 'data_processed/data_selected.csv'
best_feature.to_csv(output_path, index=False)