- <a href='#1'>Prepare</a>  
- <a href='#2'>Feature Selection</a>
    - <a href='#2-1'>1. Filter</a>
        - <a href='#2-1-1'>1.1 Pearson Correlation</a>
        - <a href='#2-1-2'>1.2 Chi-2</a>
    - <a href='#2-2'>2. Wrapper</a>
    - <a href='#2-3'>3. Embeded</a>
        - <a href='#2-3-1'>3.1 Logistics Regression L1</a>
        - <a href='#2-3-2'>3.2 Random Forest</a>
        - <a href='#2-3-3'>3.3 LightGBM</a>
- <a href='#3'>Summary</a>

# <a id='1'>Prepare</a>

In [17]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings("ignore")
application_train = pd.read_csv('data/application_train.csv')

bur_cluster = pd.read_csv('feature/bur_cluster.csv')
bureau_feature = pd.read_csv('feature/bureau_feature.csv')
ccb_feature = pd.read_csv('feature/ccb_feature.csv')
ip_feature = pd.read_csv('feature/ip_feature.csv')
pcb_feature = pd.read_csv('feature/pcb_feature.csv')
pa_feature = pd.read_csv('feature/pa_feature.csv')
print(len(list(bur_cluster.columns)), len(bureau_feature.columns), len(ccb_feature.columns), len(ip_feature.columns), len(pcb_feature.columns), len(pa_feature.columns))

for i in [bur_cluster, bureau_feature, ccb_feature, ip_feature, pcb_feature, pa_feature]:
    application_train = application_train.join(i, how='left', on='SK_ID_CURR', lsuffix='_left',rsuffix='_right')
    application_train['SK_ID_CURR'] = application_train['SK_ID_CURR_left'].values
    del application_train['SK_ID_CURR_left']
    del application_train['SK_ID_CURR_right']


2 73 21 48 32 118


In [18]:
application_train.shape

(307511, 410)

### Stratified Sampling (ratio = 0.1)

In [19]:
application_sample1 = application_train.loc[application_train.TARGET==1].sample(frac=0.1, replace=False)
print('label 1 sample size:', str(application_sample1.shape[0]))
application_sample0 = application_train.loc[application_train.TARGET==0].sample(frac=0.1, replace=False)
print('label 0 sample size:', str(application_sample0.shape[0]))
application = pd.concat([application_sample1, application_sample0], axis=0).sort_values('SK_ID_CURR')

label 1 sample size: 2482
label 0 sample size: 28269


### Impute missing values

In [20]:
categorical_list = []
numerical_list = []
for i in application.columns.tolist():
    if application[i].dtype=='object':
        categorical_list.append(i)
    else:
        numerical_list.append(i)
print('Number of categorical features:', str(len(categorical_list)))
print('Number of numerical features:', str(len(numerical_list)))

Number of categorical features: 16
Number of numerical features: 394


In [21]:
from sklearn.impute import SimpleImputer
application[numerical_list] = application[numerical_list].replace([np.inf, -np.inf], np.nan)
application[numerical_list] = SimpleImputer(strategy='median').fit_transform(application[numerical_list])
# application[numerical_list] = SimpleImputer(missing_values=np.inf, strategy='median').fit_transform(application[numerical_list])

### Deal with Categorical features: OneHotEncoding

In [22]:
# 独热
del application_train; gc.collect()
application = pd.get_dummies(application, drop_first=True)
print(application.shape)

(30751, 515)


### Feature matrix and target

In [23]:
X = application.drop(['SK_ID_CURR', 'TARGET'], axis=1)
y = application.TARGET
feature_name = X.columns.tolist()

# <a id='2'>Feature Selection</a>
- select **100** features from 226
- **xxx_support**: list to represent select this feature or not
- **xxx_feature**: the name of selected features

## <a id='2-1'>1 Filter</a>
- documentation for **SelectKBest**: http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html

###  <a id='2-1-1'>1.1 Pearson Correlation</a>
**Note**
- Normalization: no
- Impute missing values: yes

In [24]:
def cor_selector(X, y):
    cor_list = []
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-100:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature

In [25]:
cor_support, cor_feature = cor_selector(X, y)
print(str(len(cor_feature)), 'selected features')

100 selected features


###  <a id='2-1-2'>1.2 Chi-2</a>

**Note**
- Normalization: MinMaxScaler (values should be bigger than 0)
- Impute missing values: yes

In [26]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=100)
chi_selector.fit(X_norm, y)

In [27]:
application[numerical_list].replace(np.inf, np.nan).max().max()

100876407.92556468

In [28]:
application[numerical_list].max().max()

100876407.92556468

In [29]:
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

100 selected features


## <a id='2-2'>2. Wrapper</a>
- documentation for **RFE**: http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html

**Note**
- Normalization: depend on the used model; yes for LR
- Impute missing values: depend on the used model; yes for LR


In [30]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=100, step=10, verbose=5)
rfe_selector.fit(X_norm, y)

Fitting estimator with 513 features.
Fitting estimator with 503 features.
Fitting estimator with 493 features.
Fitting estimator with 483 features.
Fitting estimator with 473 features.
Fitting estimator with 463 features.
Fitting estimator with 453 features.
Fitting estimator with 443 features.
Fitting estimator with 433 features.
Fitting estimator with 423 features.
Fitting estimator with 413 features.
Fitting estimator with 403 features.
Fitting estimator with 393 features.
Fitting estimator with 383 features.
Fitting estimator with 373 features.
Fitting estimator with 363 features.
Fitting estimator with 353 features.
Fitting estimator with 343 features.
Fitting estimator with 333 features.
Fitting estimator with 323 features.
Fitting estimator with 313 features.
Fitting estimator with 303 features.
Fitting estimator with 293 features.
Fitting estimator with 283 features.
Fitting estimator with 273 features.
Fitting estimator with 263 features.
Fitting estimator with 253 features.
F

In [31]:
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

100 selected features


## <a id='2-3'>3. Embeded</a>
- documentation for **SelectFromModel**: http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html
###  <a id='2-3-1'>3.1 Logistics Regression L1</a>
**Note**
- Normalization: Yes
- Impute missing values: Yes

In [32]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l2"),threshold='1.25*median')
embeded_lr_selector.fit(X_norm, y)

In [33]:
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

223 selected features


###  <a id='2-3-2'>3.2 Random Forest</a>
**Note**
- Normalization: No
- Impute missing values: Yes

In [34]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='1.25*median')
embeded_rf_selector.fit(X, y)

In [35]:
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')

233 selected features


In [38]:
cor_support

[False,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 Fa

# <a id='3'>Summary</a>

In [39]:
pd.set_option('display.max_rows', None)
# put all selection together
feature_selection_df = pd.DataFrame({'Feature':cor_support, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(100).iloc[:,0].to_csv('selected_feature.csv', header=False, index=False)

In [40]:
feature_selection_df

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,Total
1,True,True,True,True,True,True,6
2,True,True,True,True,True,True,6
3,True,True,True,True,True,True,6
4,True,True,True,True,True,True,6
5,True,True,True,True,True,True,6
6,True,True,True,True,True,True,6
7,True,True,True,True,True,True,6
8,True,True,True,True,True,True,6
9,True,True,True,True,True,True,6
10,True,True,True,True,True,True,6
