In [5]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

In [2]:
# Read the credit dataset; the relative path is resolved against the working directory.
data = pd.read_csv(filepath_or_buffer='german_credit.csv')

## Wrapper methods
- Wrapper methods evaluate subsets of variables, which, unlike filter approaches, makes it possible to detect interactions between variables.
The two main disadvantages of these methods are:

1. The increasing overfitting risk when the number of observations is insufficient.
2. The significant computation time when the number of variables is large.

source: https://en.wikipedia.org/wiki/Feature_selection#Filter_method

## Types of wrapper methods
- recursive feature elimination
- sequential feature selection algorithms
- genetic algorithms

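Recursive feature elimination (RFE) recursively removes features, builds a model using the remaining attributes, and calculates model accuracy. In this way RFE works out which combination of attributes contributes most to predicting the target variable (or class). Note that scikit-learn's `RFE`, used below, chooses which features to drop from the fitted estimator's feature importances (or coefficients) rather than from an accuracy score.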

In [8]:
# Split the frame into predictors (every column except the target) and the target.
X = data.drop(columns='Creditability')
y = data['Creditability']

clf = RandomForestClassifier(n_jobs=2, random_state=0)
# Create the RFE model around the random forest classifier and keep 15 attributes.
# n_features_to_select is passed by keyword: scikit-learn deprecated the positional
# form in 0.23 and raises a TypeError for it from 1.0 onwards.
rfe = RFE(clf, n_features_to_select=15)
rfe = rfe.fit(X, y)
# Print summaries for the selection of attributes:
# support_ is the boolean mask of kept columns; ranking_ is 1 for every kept column
# and larger for columns that were eliminated.
print(rfe.support_)
print(rfe.ranking_)

[ True  True  True  True  True  True  True  True  True False  True  True
  True  True False  True  True False False False]
[1 1 1 1 1 1 1 1 1 2 1 1 1 1 4 1 1 5 3 6]


In [17]:
# Boolean mask over the fitted columns: True where RFE kept the feature.
rfe.get_support(indices=False)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True, False,
       False, False])

In [19]:
# List every candidate feature name, in the same column order RFE was fitted on.
all_columns = X.columns
print(all_columns)

Index(['Account Balance', 'Duration of Credit (month)',
       'Payment Status of Previous Credit', 'Purpose', 'Credit Amount',
       'Value Savings/Stocks', 'Length of current employment',
       'Instalment per cent', 'Sex & Marital Status', 'Guarantors',
       'Duration in Current address', 'Most valuable available asset',
       'Age (years)', 'Concurrent Credits', 'Type of apartment',
       'No of Credits at this Bank', 'Occupation', 'No of dependents',
       'Telephone', 'Foreign Worker'],
      dtype='object')


In [18]:
# Keep only the column names that the RFE support mask flags as selected.
kept_mask = rfe.get_support()
print(X.columns[kept_mask])

Index(['Account Balance', 'Duration of Credit (month)',
       'Payment Status of Previous Credit', 'Purpose', 'Credit Amount',
       'Value Savings/Stocks', 'Length of current employment',
       'Instalment per cent', 'Sex & Marital Status',
       'Duration in Current address', 'Most valuable available asset',
       'Age (years)', 'Concurrent Credits', 'No of Credits at this Bank',
       'Occupation'],
      dtype='object')


Dropped columns (the five features RFE did not select):
1. Guarantors
2. Type of apartment
3. No of dependents
4. Telephone
5. Foreign Worker

In [13]:
names = X.columns
# RFE ranks are integers (1 = selected), so there is nothing to round. Casting to a
# plain int keeps the printed tuples as (1, 'name') even when NumPy's scalar repr
# would otherwise show them as np.int64(1).
ranked_features = sorted((int(rank), name) for rank, name in zip(rfe.ranking_, names))
print("Features sorted by their rank:")
print(ranked_features)

Features sorted by their rank:
[(1, 'Account Balance'), (1, 'Age (years)'), (1, 'Concurrent Credits'), (1, 'Credit Amount'), (1, 'Duration in Current address'), (1, 'Duration of Credit (month)'), (1, 'Instalment per cent'), (1, 'Length of current employment'), (1, 'Most valuable available asset'), (1, 'No of Credits at this Bank'), (1, 'Occupation'), (1, 'Payment Status of Previous Credit'), (1, 'Purpose'), (1, 'Sex & Marital Status'), (1, 'Value Savings/Stocks'), (2, 'Guarantors'), (3, 'Telephone'), (4, 'Type of apartment'), (5, 'No of dependents'), (6, 'Foreign Worker')]
