# Feature Selection

In [None]:
import pandas as pd
import numpy as np
from IPython.display import display

# Notebook-wide switches read by the cells below.
REMOVE_HIGH_CORRELATED = False  # enable the correlation-based column pruning cell
SELECT_WRAPPED = True           # enable the wrapper-based feature selection cells

# Do not truncate columns when frames are rendered.
pd.set_option('display.max_columns', None)

# Load the encoded dataset. `_data` is an untouched copy, so columns that are
# later removed from `data` can still be read from it.
data = pd.read_csv('data_processed/complete/enc_data.csv')
_data = data.copy()

display(data.head())

In [None]:
def get_features(df):
    """Return a new frame with every column of ``df`` except 'status'."""
    feature_frame = df.drop(columns='status')
    return feature_frame
def get_target(df):
    """Return a new frame holding only the 'status' column of ``df``.

    Every column whose label is not 'status' is dropped, so the result is a
    DataFrame (not a Series); it has no columns if 'status' is absent.
    """
    non_target_columns = df.columns.difference(['status'])
    return df.drop(columns=non_target_columns)

### Remove highly correlated features

In [None]:
if REMOVE_HIGH_CORRELATED:
    # Absolute pairwise correlation between all columns of `data`.
    cor_matrix = data.corr().abs()
    display(cor_matrix)

    # Keep only the strict upper triangle (k=1 excludes the diagonal) so each
    # column pair is inspected once; every other cell becomes NaN.
    # The builtin `bool` is used because the `np.bool` alias raises
    # AttributeError on NumPy 1.24-1.26.
    upper_mask = np.triu(np.ones(cor_matrix.shape), k=1).astype(bool)
    upper_tri = cor_matrix.where(upper_mask)
    display(upper_tri)

    # Walk the columns from last to first and flag each one whose correlation
    # with any column to its left exceeds 0.80 (NaN cells compare as False).
    cols = list(upper_tri.columns)
    cols.reverse()
    to_drop = [column for column in cols if (upper_tri[column] > 0.80).any()]
    display(to_drop)

    data.drop(to_drop, axis=1, inplace=True)
    display(data.head())



### Remove unnecessary features

In [None]:
# Keep every column that is neither an identifier (name ending in '_id')
# nor the 'code' column.
no_ids = [
    c for c in data.columns
    if not (c.endswith('_id') or c == 'code')
]
data = data[no_ids]
# data.drop(['account_frequency', 'gender', 'card_type'], axis=1, inplace=True)

display(data.head())
display(data.info())

### Define model

In [None]:
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE

def make_pipeline(classifier, smote = False, scaler = None):
    """Assemble an imbalanced-learn pipeline that ends in ``classifier``.

    Steps are chained in this order: a SMOTE oversampler with
    ``random_state=11`` when ``smote`` is truthy, then ``scaler`` when it is
    truthy, and finally the classifier under the name 'classifier'.
    """
    smote_step = [['smote', SMOTE(random_state=11)]] if smote else []
    scaler_step = [['scaler', scaler]] if scaler else []
    classifier_step = [['classifier', classifier]]
    return imbpipeline(steps = smote_step + scaler_step + classifier_step)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

# Shared estimator for all wrapper selectors below. The forest is seeded with
# the same value as the SMOTE step so repeated runs give the same selection.
estimator = make_pipeline(RandomForestClassifier(random_state=11), smote = True)

## Feature Selection
### Wrapper Method
Split the features into subsets and train a model on each subset. Based on the model's score, add or remove features and train the model again.
#### 1. Forward Selection

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

# Predictors (every column except 'status') and the target series.
x, y = get_features(data), data['status']

def forward_selection(features, target, k_features = 20):
    """Run sequential forward selection and return its metrics as a frame.

    Wraps the notebook-level ``estimator`` in an mlxtend
    ``SequentialFeatureSelector`` (forward, non-floating, 5-fold CV,
    'roc_auc' scoring, ``n_jobs=-1``), fits it on ``features``/``target``,
    plots the metrics with ``kind='std_dev'`` and returns the selector's
    metric dict transposed so that each dict key becomes a row.
    """
    selector_options = dict(
        k_features=k_features,
        forward=True,
        floating=False,
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
    )
    selector = SFS(estimator, **selector_options).fit(features, target)

    plot_sfs(selector.get_metric_dict(), kind='std_dev')
    plt.title('Sequential Forward Selection')
    plt.grid()
    plt.show()

    metrics_frame = pd.DataFrame.from_dict(selector.get_metric_dict())
    return metrics_frame.T

if SELECT_WRAPPED:
    forward_selection_result = forward_selection(x, y)
    # A bare expression nested inside an `if` is not rendered by the notebook,
    # so the table is shown explicitly.
    display(forward_selection_result)

#### 2. Backward Elimination

In [None]:
def backward_selection(features, target, k_features = 1):
    """Run sequential backward elimination and return its metrics as a frame.

    Wraps the notebook-level ``estimator`` in an mlxtend
    ``SequentialFeatureSelector`` (backward, non-floating, 5-fold CV,
    'roc_auc' scoring, ``n_jobs=-1``), fits it on ``features``/``target``,
    plots the metrics with ``kind='std_dev'`` and returns the selector's
    metric dict transposed so that each dict key becomes a row.
    """
    selector_options = dict(
        k_features=k_features,
        forward=False,
        floating=False,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
    )
    selector = SFS(estimator, **selector_options).fit(features, target)

    plot_sfs(selector.get_metric_dict(), kind='std_dev')
    plt.title('Backward Elimination')
    plt.grid()
    plt.show()

    metrics_frame = pd.DataFrame.from_dict(selector.get_metric_dict())
    return metrics_frame.T

if SELECT_WRAPPED:
    backward_selection_result = backward_selection(x, y)
    # A bare expression nested inside an `if` is not rendered by the notebook,
    # so the table is shown explicitly.
    display(backward_selection_result)

#### 3. Bi-directional Elimination (Step-wise Selection)

In [None]:
def bi_directional_elimination(features, target, k_features = (1,20)):
    """Run floating forward selection and return its metrics as a frame.

    Wraps the notebook-level ``estimator`` in an mlxtend
    ``SequentialFeatureSelector`` (forward with ``floating=True``, 5-fold CV,
    'roc_auc' scoring, ``n_jobs=-1``), fits it on ``features``/``target``,
    plots the metrics with ``kind='std_dev'`` and returns the selector's
    metric dict transposed so that each dict key becomes a row.
    """
    selector_options = dict(
        k_features=k_features,
        forward=True,
        floating=True,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
    )
    selector = SFS(estimator, **selector_options).fit(features, target)

    plot_sfs(selector.get_metric_dict(), kind='std_dev')
    plt.title('Bi-directional Elimination')
    plt.grid()
    plt.show()

    metrics_frame = pd.DataFrame.from_dict(selector.get_metric_dict())
    return metrics_frame.T

if SELECT_WRAPPED:
    bi_directional_elimination_result = bi_directional_elimination(x, y)
    # A bare expression nested inside an `if` is not rendered by the notebook,
    # so the table is shown explicitly.
    display(bi_directional_elimination_result)

### Overview of the methods
#### Comparison

In [None]:
if SELECT_WRAPPED:
    # The transposed metric frames hold generic object columns, so cast
    # 'avg_score' to float before looking for the maximum.
    forward_selection_result['avg_score'] = forward_selection_result['avg_score'].astype(float)
    backward_selection_result['avg_score'] = backward_selection_result['avg_score'].astype(float)
    bi_directional_elimination_result['avg_score'] = bi_directional_elimination_result['avg_score'].astype(float)

    # Find the best result for each method.
    # `idxmax()` returns an index *label*, so the row must be fetched with
    # `.loc`. These frames are indexed by the keys of the selector's metric
    # dict (presumably subset sizes starting at 1 - confirm against mlxtend),
    # not by 0-based positions, so `.iloc` would return the wrong row or raise
    # IndexError.
    a = forward_selection_result.loc[forward_selection_result['avg_score'].idxmax()]
    b = backward_selection_result.loc[backward_selection_result['avg_score'].idxmax()]
    c = bi_directional_elimination_result.loc[bi_directional_elimination_result['avg_score'].idxmax()]

    # One summary row per method: chosen features, their count and the score.
    c_df = pd.DataFrame({
        'method': ['forward selection', 'backward elimination', 'bi-directional'],
        'feature_names': [a['feature_names'], b['feature_names'], c['feature_names']],
        'n_features': [len(a['feature_names']), len(b['feature_names']), len(c['feature_names'])],
        'avg_score': [a['avg_score'], b['avg_score'], c['avg_score']],
    })

    display(c_df)

#### Selection

In [None]:
if SELECT_WRAPPED:
    # `idxmax()` yields an index label, so the winning row is looked up by
    # label with `.loc`.
    best_features_names = c_df.loc[c_df['avg_score'].idxmax()]['feature_names']
    print('Best features: ', best_features_names)

    # Select the winning columns in one step; this keeps each column's dtype,
    # which building the frame from a list of Series and transposing it does
    # not guarantee.
    best_feature = data[list(best_features_names)]
    # A bare expression nested inside an `if` is not rendered by the notebook,
    # so the frame is shown explicitly.
    display(best_feature)

## Save

In [None]:
# Fallback feature list, used when wrapper selection is disabled.
features = ['loan_amount', 'loan_payments', 'has_disponent', 'mean_balance', 'min_balance', 'monthly_diff', 'account_frequency_monthly issuance', 'account_frequency_weekly issuance']
if SELECT_WRAPPED:
    features = list(best_features_names)

# Output layout: loan_id, the selected features, then status. 'loan_id' and
# 'status' are read from the untouched copy `_data` because the id columns
# were removed from `data` earlier in the notebook.
features_selected = pd.concat(
    [_data['loan_id'], data[features], _data['status']],
    axis=1,
)
features_selected.to_csv('data_processed/complete/data_selected.csv', index=False)