In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the raw application tables; both paths are relative to the notebook's
# working directory.
train_csv_path = "./data/application_train.csv"
test_csv_path = "./data/application_test.csv"

df_application_train = pd.read_csv(train_csv_path)
df_application_test = pd.read_csv(test_csv_path)

## Perform Basic Data Cleansing

In [None]:
# Drop columns with all NaN values
df_application_train = df_application_train.dropna(axis=1, how='all')

# Keep only columns with at least 95% non-NaN values, i.e. drop every column
# that is missing in more than 5% of the rows. (`thresh` is the minimum number
# of non-NaN entries a column needs in order to be kept.)
threshold = len(df_application_train) * 0.95
df_application_train = df_application_train.dropna(axis=1, thresh=threshold)

# Drop the remaining rows that still contain any NaN value
df_application_train = df_application_train.dropna()

# Replace negative numeric values with their magnitude; the chi-squared scoring
# used later in this notebook requires non-negative feature values.
# Only numeric columns are touched: comparing the whole frame with 0 raises a
# TypeError as soon as a string (categorical) column is present, because the
# categoricals are not one-hot encoded until the next step.
numeric_columns = df_application_train.select_dtypes(include='number').columns
df_application_train[numeric_columns] = df_application_train[numeric_columns].abs()

# One-hot encode categorical variables
df_application_train = pd.get_dummies(df_application_train)

# Split into the feature matrix and the label; SK_ID_CURR is dropped together
# with the label so that it is not used as a feature.
X = df_application_train.drop(columns=['TARGET', 'SK_ID_CURR'])
y = df_application_train['TARGET']

## Select best features with `Univariate Selection`
> This method uses statistical tests to select features that have the strongest relationship with the output variable.

In [None]:
# Score every feature against the target with the chi-squared test (k=10
# features are selected by the fitted selector).
best_features = SelectKBest(score_func=chi2, k=10)
fit = best_features.fit(X, y)

# Pair each column name with its score in a single two-column table
feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': fit.scores_})

# Show the ten highest-scoring features
print(feature_scores.nlargest(10, 'Score'))

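## Select best features with `Recursive Feature Elimination (RFE)`
> RFE works by recursively removing the least important features and rebuilding the model with the remaining features. It uses the model's feature weights or importances (and, in cross-validated variants, its accuracy) to identify which features contribute the most.
>
> **Note:** the cell below does not run RFE itself; it fits a single `RandomForestClassifier` and ranks the features by its `feature_importances_`.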


In [None]:
# Fit a random forest on all features. A fixed random_state makes the fitted
# forest, and therefore the ranking printed below, reproducible across kernel
# restarts.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Get feature importances and the column positions sorted from most to least
# important
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking (1 = most important)
for rank, column_position in enumerate(indices, start=1):
    print("%d. Column %s (%f)" %
          (rank, X.columns[column_position], importances[column_position]))

## Select best features with `Correlation Coefficient`
> The correlation coefficient is a measure of the linear relationship between two variables.

In [None]:
# Calculate the correlation matrix of all features together with the target.
# TARGET was dropped from X earlier, so it is added back here; without it the
# 'TARGET' column lookup below raises a KeyError.
corr_matrix = X.assign(TARGET=y).corr()

# Get the absolute correlation of every feature with the target variable,
# leaving out the target's trivial correlation with itself
important_features = (
    corr_matrix['TARGET']
    .drop(labels='TARGET')
    .abs()
    .sort_values(ascending=False)
)

# Show the ten features most strongly correlated with the target
important_features.head(10)