# Feature selection

In [1]:
import os

import matplotlib.pyplot
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# Show every row and column when a DataFrame is displayed.
# Full option names are used on purpose: pandas resolves option keys by regex,
# and an abbreviated pattern such as 'max.rows' raises OptionError as soon as
# more than one registered option matches it (e.g. 'styler.render.max_rows').
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
%matplotlib inline

### load datasets

In [2]:
%store -r train_data_formodel
data = train_data_formodel
%store -r test_data
%store -r my_data
%store -r uniques

## configurations

* random_seed_state -> number, sets random state for model and for stratified splits
* save_plots -> boolean, when True the feature-evaluation plot is saved to `figures/rfe.png`

In [3]:
# Seed passed as random_state to the random forest below.
random_seed_state = 42
# When True, the plotting cell writes its figure to 'figures/rfe.png'.
save_plots = True

### Recursive feature elimination with 3-fold cross-validation is done using Random Forest Classifiers 

In [4]:
# Base estimator whose feature importances drive the recursive elimination.
my_rfc = RandomForestClassifier(
    n_estimators=1000,
    # Negative n_jobs follows the joblib rule (n_cpus + 1 + n_jobs),
    # so -5 leaves four cores free.
    n_jobs=-5,
    random_state=random_seed_state,
)

In [5]:
# Recursive feature elimination, dropping one feature per step and scoring each
# feature count by macro-averaged F1 over 3 stratified folds.
# The splitter shuffles with random_seed_state so the configured seed really
# controls the fold assignment; without shuffle=True, StratifiedKFold ignores
# any seed and simply follows the row order of the data.
my_rfc_selector = RFECV(
    estimator=my_rfc,
    step=1,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_seed_state),
    verbose=2,
    scoring='f1_macro',
)

### set X and y as features and target respectively

In [6]:
# Features: every column from position 9 up to, but not including, the last one.
# NOTE(review): presumably the first nine columns are metadata and the last is
# the target - confirm against the upstream notebook.
X = data.loc[:, data.columns.values[9:-1]]
# Target: the 'class' column.
y = data.loc[:, 'class']

### check dimensions of features and target are as expected

In [7]:
# Print the feature-matrix shape and the target shape, one per line.
print(X.shape, y.shape, sep='\n')

(786, 53)
(786,)


### do recursive feature elimination

In [8]:
# Run the cross-validated elimination. fit() returns the estimator itself, so
# `selector` and `my_rfc_selector` refer to the same fitted object.
selector = my_rfc_selector.fit(X=X, y=y)

Fitting estimator with 53 features.


  'precision', 'predicted', average, warn_for)


Fitting estimator with 52 features.
Fitting estimator with 51 features.


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-941d583624f1>", line 1, in <module>
    selector = my_rfc_selector.fit(X, y)
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/feature_selection/rfe.py", line 514, in fit
    for train, test in cv.split(X, y, groups))
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/feature_selection/rfe.py", line 514, in <genexpr>
    for train, test in cv.split(X, y, groups))
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/feature_selection/rfe.py", line 32, in _rfe_single_fit
    X_train, y_train, lambda estimator, features:
  File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/feature_selection/rfe.py", line 179, in _fit
    estimator.fit(X[:, features], y)
  File "/home/ubuntu/anaconda3/lib/python3.6/site

KeyboardInterrupt: 

### identify number of features with the best f1 score


In [None]:
# Pair every candidate feature with its RFE rank (1 = selected) and order the
# table by rank. Feature names are taken from X, the frame the selector was
# fitted on, so they always line up with ranking_.
# A stable sort keeps tied features (all selected features share rank 1) in
# their original column order, which makes any later top-N slice deterministic.
feat_ranks = pd.DataFrame(
    data={'features': X.columns.values, 'rfe ranking': my_rfc_selector.ranking_}
).sort_values(by='rfe ranking', kind='mergesort')
# Number of features that achieved the best cross-validated score.
best_number_feats_rfe = my_rfc_selector.n_features_

In [None]:
# Manual override: replaces the feature count taken from the fitted selector
# in the previous cell with a fixed value.
# NOTE(review): the rationale for 25 is not recorded in this notebook - confirm.
best_number_feats_rfe = 25

### features ranked by their recursive feature elimination scores

In [None]:
# Display the feature table, ordered by RFE rank, as this cell's output.
feat_ranks

### select best features 

In [None]:
# Names of the first `best_number_feats_rfe` rows of the rank-sorted table
# (selected by position, not by index label).
best_feats = list(feat_ranks.iloc[:best_number_feats_rfe]['features'])

### macro-averaged F1 score is plotted against number of features

In [None]:
# Mean cross-validated score for each candidate feature count.
# Newer scikit-learn exposes these as cv_results_['mean_test_score'];
# grid_scores_ only exists on older releases, so it is kept as a fallback.
if hasattr(my_rfc_selector, 'cv_results_'):
    rfecv_scores = my_rfc_selector.cv_results_['mean_test_score']
else:
    rfecv_scores = my_rfc_selector.grid_scores_

# One row per evaluated feature count. Counting up from 1 matches the selector
# above because it removes a single feature per step (step=1).
RFCV_df = pd.DataFrame(data={
    'Number of Features': range(1, len(rfecv_scores) + 1),
    'F1 Score': rfecv_scores,
})

In [None]:
# A4-landscape sized figure (inches).
sns.set(rc={'figure.figsize': (11.7, 8.27)})
# Keep the Axes returned by seaborn; set_title() returns a Text object, so the
# title is applied in a separate statement instead of being chained.
plot = sns.scatterplot(data=RFCV_df, x='Number of Features', y='F1 Score')
plot.set_title('Features evaluated by F1 score')
if save_plots:
    fig = plot.get_figure()
    # savefig does not create missing directories, so make sure it exists.
    os.makedirs('figures', exist_ok=True)
    fig.savefig('figures/rfe.png')

In [None]:
# Report the selected feature names.
print(f'the best features are {best_feats}')

### the names of the best features and all features in order of recursive feature elimination rank are stored 

In [None]:
%store best_feats