# Feature selection

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

from config_and_dependencies.config import *

pd.set_option('max.rows', None)
pd.set_option('max.columns', None)
%matplotlib inline

### load datasets

In [2]:
#%store -r train_data_formodel
#data = train_data_formodel
%store -r test_data
#%store -r my_data
#%store -r uniques

#Tom Added
%store -r MintCol

In [3]:
#Tom Added
data = test_data

In [4]:
test_data.head()

Unnamed: 0,ARSENIC,GOLD,COPPER,IRON,NICKEL,LEAD,ANTIMONY,TIN,ZINC,MANGANESE,CHROMIUM,COBALT,SILVER
0,0.085,0.08,71.7,0.003,0.0226,0.21,0.01,0.0035,0.0003,0.0001,0.0001,0.0154,27.87
1,0.016,0.16,75.43,0.008,0.0084,0.23,0.012,0.0012,0.0001,0.0001,0.0001,0.0003,24.14
2,0.012,0.17,74.1,0.004,0.0068,0.18,0.028,0.0012,0.0001,0.0001,0.0001,0.0001,25.5
3,0.009,0.59,12.2,0.002,0.0004,0.38,0.003,0.002,0.0002,0.0002,0.0002,0.0002,86.81
4,0.015,0.46,31.7,0.008,0.0047,0.2,0.079,0.0782,0.0009,0.0003,0.0006,0.0003,67.45


In [5]:
MintCol.head()

0    Alexandria
1    Alexandria
2    Alexandria
3      Caesarea
4      Caesarea
Name: MINT, dtype: object

In [6]:
#Rename MintCol Column to MINT
MintCol.columns = ['MINT']

In [7]:
#Combine MintCol and Data
data = pd.concat([MintCol, test_data], axis=1)
data.head()

Unnamed: 0,MINT,ARSENIC,GOLD,COPPER,IRON,NICKEL,LEAD,ANTIMONY,TIN,ZINC,MANGANESE,CHROMIUM,COBALT,SILVER
0,Alexandria,0.085,0.08,71.7,0.003,0.0226,0.21,0.01,0.0035,0.0003,0.0001,0.0001,0.0154,27.87
1,Alexandria,0.016,0.16,75.43,0.008,0.0084,0.23,0.012,0.0012,0.0001,0.0001,0.0001,0.0003,24.14
2,Alexandria,0.012,0.17,74.1,0.004,0.0068,0.18,0.028,0.0012,0.0001,0.0001,0.0001,0.0001,25.5
3,Caesarea,0.009,0.59,12.2,0.002,0.0004,0.38,0.003,0.002,0.0002,0.0002,0.0002,0.0002,86.81
4,Caesarea,0.015,0.46,31.7,0.008,0.0047,0.2,0.079,0.0782,0.0009,0.0003,0.0006,0.0003,67.45


In [8]:
#X = data[data.columns.values[1:]]
#X.head()

## configurations


In [9]:
save_plots = True

### Recursive feature elimination with 3-fold cross-validation is done using Random Forest Classifiers 

In [10]:
my_rfc = RandomForestClassifier(n_estimators=5, n_jobs=-5, random_state = random_seed_state)

In [11]:
my_rfc_selector = RFECV(estimator=my_rfc, step=1, cv=StratifiedKFold(3), verbose=2, scoring='f1_macro')

### set X and y as features and target respectively

In [12]:
X = data[data.columns.values[1:]]
y = data['MINT']

### check dimensions of features and target are as expected

In [13]:
print(X.shape)
print(y.shape)

(348, 13)
(348,)


### do recursive feature elimination

In [14]:
selector = my_rfc_selector.fit(X, y)

ValueError: Input contains NaN

### identify number of features with the best f1 score


In [None]:
my_rfc_selector.ranking_
feat_ranks = pd.DataFrame(data = {'features': data.columns.values[1:], 'rfe ranking': my_rfc_selector.ranking_} )
feat_ranks.sort_values(by = 'rfe ranking', inplace=True)
best_number_feats_rfe = my_rfc_selector.n_features_

In [None]:
best_number_feats_rfe = 5

### features ranked by their recursive feature elimination scores

In [None]:
feat_ranks.to_csv('Figure/feat_ranks.csv')

### select best features 

In [None]:
best_feats = list(feat_ranks['features'][0: best_number_feats_rfe])

### weighted-f1 score is plotted against number of features

In [None]:
RFCV_df = pd.DataFrame(data = {'Number of Features' : range(1, len(my_rfc_selector.grid_scores_) + 1), 'F1 Score': my_rfc_selector.grid_scores_})

In [None]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
plot = sns.scatterplot(data = RFCV_df, x = 'Number of Features', y = 'F1 Score').set_title('Features evaluated by F1 score')
if save_plots:
    fig = plot.get_figure()
    fig.savefig('Figure/rfe.png')

In [None]:
RFCV_df.to_csv('Figure/RFCV_DF.csv')

In [None]:
print('the best features are {0}'.format(best_feats))

### the names of the best features and all features in order of recursive feature elimination rank are stored 

In [None]:
%store best_feats

In [None]:
len(best_feats)