# Feature selection

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# NOTE(review): wildcard import — presumably this is where `random_seed_state`
# (used when building the random forest below) comes from; confirm, and
# consider importing the needed names explicitly.
from config_and_dependencies.config import *

# Remove pandas' display truncation so frames are shown with all rows and columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
%matplotlib inline

### load datasets

In [4]:
# Restore variables persisted with IPython's %store magic
# (presumably written by an earlier notebook in this project — confirm).
%store -r train_data_formodel
# `data` is the working alias for the training frame used in the cells below.
data = train_data_formodel
# NOTE(review): test_data, my_data and uniques are not referenced in the visible cells.
%store -r test_data
%store -r my_data
%store -r uniques

## configurations


In [5]:
# When True, the RFE plot cell further down also writes its figure to figures/rfe.png.
save_plots = True

### Recursive feature elimination with 3-fold cross-validation is done using Random Forest Classifiers 

In [6]:
# Base estimator used by the recursive feature elimination.
# n_jobs=-5 follows the joblib convention (n_cpus + 1 + n_jobs workers).
# NOTE(review): confirm -5 is intentional and not a typo for -1.
my_rfc = RandomForestClassifier(
    n_estimators=5,
    random_state=random_seed_state,
    n_jobs=-5,
)

In [7]:
# Cross-validated recursive feature elimination: one feature is dropped per
# iteration (step=1) and each subset size is scored with macro-averaged F1
# over 3 stratified folds.
my_rfc_selector = RFECV(
    estimator=my_rfc,
    step=1,
    cv=StratifiedKFold(n_splits=3),
    scoring='f1_macro',
    verbose=2,
)

### set X and y as features and target respectively

In [8]:
# Feature columns: everything from position 9 up to, but excluding, the last column.
# NOTE(review): presumably the first nine columns are identifiers/metadata — confirm.
feature_columns = data.columns.values[9:-1]
X = data[feature_columns]
# Target labels.
y = data['class']

### check dimensions of features and target are as expected

In [9]:
# Expect (n_samples, n_features) for X and (n_samples,) for y, one per line.
print(X.shape, y.shape, sep='\n')

(696, 53)
(696,)


### do recursive feature elimination

In [10]:
# Run the cross-validated elimination (verbose=2 logs one line per fitted subset).
# fit() returns the fitted estimator itself, so `selector` and
# `my_rfc_selector` refer to the same object.
selector = my_rfc_selector.fit(X, y)

Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 fe

### identify number of features with the best f1 score


In [11]:
# Pair every candidate feature with its RFE rank (rank 1 = selected), best first.
feat_ranks = pd.DataFrame(
    {
        'features': data.columns.values[9:-1],
        'rfe ranking': my_rfc_selector.ranking_,
    }
).sort_values(by='rfe ranking')

# Number of features chosen by cross-validation.
best_number_feats_rfe = my_rfc_selector.n_features_

In [12]:
# NOTE(review): manual override — this replaces the cross-validated
# `n_features_` value assigned in the previous cell with a fixed 15.
# Document why 15 was chosen, or move it to the configuration cell.
best_number_feats_rfe = 15

### features ranked by their recursive feature elimination scores

In [13]:
# Persist the ranking table; the row index is written as the first CSV column
# (pandas default). Assumes the output_datasets/ directory already exists.
feat_ranks.to_csv('output_datasets/feat_ranks.csv')

### select best features 

In [14]:
# Names of the first `best_number_feats_rfe` rows of the rank-sorted table,
# taken by position (not by index label).
best_feats = feat_ranks['features'].iloc[:best_number_feats_rfe].tolist()

### macro-averaged F1 score is plotted against number of features

In [58]:
# One row per candidate feature count with the mean cross-validated score at that count.
# (Earlier drafts read the scores from `grid_scores_`; `cv_results_` is used instead.)
mean_cv_scores = my_rfc_selector.cv_results_['mean_test_score']
RFCV_df = pd.DataFrame(
    {
        'Number of Features': range(1, len(mean_cv_scores) + 1),
        'F1 Score': mean_cv_scores,
    }
)

ValueError: Per-column arrays must each be 1-dimensional

In [18]:
# NOTE(review): scratch cell — everything below is commented-out debugging code,
# so nothing in this cell executes. Safe to delete once no longer needed.
# checktype = type(RFCV_df)
# RFCV_df = pd.Dataframe(
    
# NumFeatures = range(len(my_rfc_selector.grid_scores_) + 1)
# print(NumFeatures)
# 'Number of Features' : range
# NumFeatures = pd.DataFrame(NumFeatures)
# NumFeatures.columns['Number of Features']
#pd.show_versions()

In [64]:
# Mean cross-validated score for each candidate feature count.
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `cv_results_['mean_test_score']` is its replacement and is what the
# RFCV_df cell already reads.
grid_scoresdf = my_rfc_selector.cv_results_['mean_test_score']
# Sanity check: expect one score per feature-subset size.
print(len(grid_scoresdf))

53


In [15]:
# Scatter of mean cross-validated F1 against the number of features kept.
sns.set(rc={'figure.figsize':(11.7,8.27)})
# Keep the Axes in `plot`: chaining .set_title() onto the scatterplot call
# would bind the returned title Text object instead of the Axes.
plot = sns.scatterplot(data=RFCV_df, x='Number of Features', y='F1 Score')
plot.set_title('Features evaluated by F1 score')
if save_plots:
    # Assumes the figures/ directory already exists.
    fig = plot.get_figure()
    fig.savefig('figures/rfe.png')

NameError: name 'RFCV_df' is not defined

In [None]:
# Save the score-vs-feature-count table; the row index is written too (pandas default).
# Assumes the output_datasets/ directory already exists.
RFCV_df.to_csv('output_datasets/RFCV_DF.csv')

In [None]:
# Report the selected feature names.
print(f'the best features are {best_feats}')

### the names of the best features are stored for reuse (the full ranking of all features, in order of recursive feature elimination rank, was written to output_datasets/feat_ranks.csv above)

In [None]:
# Persist best_feats with IPython's %store magic so it can be restored elsewhere via `%store -r best_feats`.
%store best_feats

In [None]:
# Display the number of selected features.
len(best_feats)