In [61]:
import pandas as pd
import numpy as np
import swifter
import seaborn as sns
import matplotlib.pyplot

# Show every row and column when a DataFrame is displayed (None = no limit).
# Full option names are used on purpose: set_option treats a short key as a regex,
# and 'max.rows' / 'max.columns' also match 'styler.render.max_rows' /
# 'styler.render.max_columns' on newer pandas, which raises OptionError.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
%matplotlib inline

### load datasets

In [62]:
# Restore variables previously persisted with IPython's %store magic
# (presumably saved by an upstream notebook — confirm).
%store -r train_data_formodel_transformed
# Short alias for the restored training frame; later cells refer to it as `data`.
data = train_data_formodel_transformed
%store -r test_data
# NOTE(review): my_data and uniques are not referenced by any visible cell — confirm they are still needed.
%store -r my_data
%store -r uniques

In [63]:
# Read the saved t-SNE and PCA component tables for the training set.
tsne_dims_train, pca_dims_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../dim_red_components/tsne_df_train.csv',
        '../dim_red_components/PC_df_train.csv',
    )
)

## Univariate feature selection

### Both the train and test datasets are concatenated for the purposes of univariate feature selection

# Take the columns from position 8 up to (not including) the last one of each frame,
# then stack the rows of both frames into one table.
# NOTE(review): `train_data` is not loaded by any visible cell (only `data` and
# `test_data` are restored) — confirm where it is defined before re-enabling this cell.
total_data = pd.concat(
    [frame[frame.columns.values[8:-1]] for frame in (train_data, test_data)],
    axis=0,
)

total_data.head()

### Features with a variance above a defined threshold are identified

# Variance filter: only features whose variance is above 0.001 are kept.
my_variance_filter = VarianceThreshold(threshold=0.001)

high_variance_data = my_variance_filter.fit_transform(total_data)

# Boolean mask over the input columns: True where the feature was kept.
high_variance_data_indexes = my_variance_filter.get_support()

high_variance_data_indexes

# Labels of the columns that passed the filter.
high_variance_col_names = total_data.columns[high_variance_data_indexes].values

print(high_variance_col_names)

# Compare the table shape before and after the filter.
print('data shape before removal of low variance features', total_data.shape, sep='\n')
print('data shape after removal of low variance features', high_variance_data.shape, sep='\n')

# RFE — this isn't working yet; the rankings all came out as 1

In [64]:
# Preview the first five rows of the training frame.
data.head(n=5)

Unnamed: 0,Analysis,Geology,Province,Region,Site,SubSite,Formation,Band,Nodule,Li7,Be9,B11,Mg24,Al27,Si28,P31,S33,K39,Ca42,Sc45,Ti47,V51,Cr52,Mn55,Fe56,Co59,Ni60,Cu63,Zn68,Ga69,Ge72,As75,Rb85,Sr88,Y89,Zr90,Nb93,Mo95,Cd111,In115,Sn118,Cs133,Ba137,La139,Ce140,Pr141,Nd146,Sm147,Eu153,Gd157,Tb159,Dy163,Ho165,Er166,Tm169,Yb172,Lu175,Hf178,Ta181,Pb208,Th232,U238,class
0,10_FH1_1_1,Bedrock,Northern,,FH,FH1,Burnham,FH1,FH1_1_1,2.811208,0.113329,48.36,5.047481,6.850878,1.005084e+17,3.937301,538.57,6.124552,6.570028,0.350657,2.808197,0.239017,1.458615,0.524729,2.247072,0.04879,0.587787,0.963174,2.469793,0.223144,0.797507,0.14842,0.357674,2.634762,0.631272,0.920283,0.086178,0.04879,0.019803,0.0,0.04879,0.00995,2.020222,0.609766,0.667829,0.207014,0.625938,0.14842,0.039221,0.14842,0.019803,0.10436,0.029559,0.058269,0.00995,0.019803,0.0,0.039221,0.00995,0.215111,0.067659,0.04879,0
1,11_FH1_1_1,Bedrock,Northern,,FH,FH1,Burnham,FH1,FH1_1_1,2.525729,0.086178,44.77,3.15359,6.982965,1.005517e+17,4.275415,438.2,5.963117,6.246572,0.364643,2.968875,0.254642,1.492904,0.698135,2.532903,0.10436,0.307485,0.425268,2.29556,0.29267,0.615186,0.09531,0.371564,2.654649,0.667829,1.007958,0.067659,0.00995,0.019803,0.0,0.039221,0.019803,2.201659,0.652325,0.698135,0.207014,0.683097,0.165514,0.039221,0.165514,0.019803,0.122218,0.029559,0.058269,0.00995,0.039221,0.00995,0.04879,0.0,0.067659,0.076961,0.039221,0
2,12_FH1_1_1,Bedrock,Northern,,FH,FH1,Burnham,FH1,FH1_1_1,3.046901,0.058269,44.88,3.777348,6.431669,1.007364e+17,4.658427,372.66,5.899103,6.865776,0.565314,3.039271,0.438255,1.446919,0.792993,4.488524,0.19062,0.985817,0.928219,2.56341,0.223144,0.996949,0.122218,0.357674,2.253395,0.625938,0.65752,0.09531,0.019803,0.019803,0.0,0.04879,0.00995,1.418277,0.641854,0.732368,0.231112,0.609766,0.139762,0.039221,0.173953,0.019803,0.131028,0.019803,0.067659,0.00995,0.058269,0.0,0.019803,0.00995,0.378436,0.04879,0.04879,0
3,13_FH1_1_2,Bedrock,Northern,,FH,FH1,Burnham,FH1,FH1_1_2,2.498152,0.548121,47.06,5.096324,7.042452,1.006094e+17,10.939673,1075.89,6.307278,7.684922,0.357674,3.768153,0.512824,5.033179,1.764731,4.985933,0.262364,1.238374,1.795087,2.898671,0.300105,1.141033,0.609766,0.565314,2.650421,0.678034,1.098612,0.09531,0.254642,0.165514,0.00995,0.576613,0.039221,2.276241,0.65752,0.667829,0.19062,0.559616,0.122218,0.039221,0.223144,0.019803,0.086178,0.029559,0.04879,0.0,0.029559,0.0,0.076961,0.0,0.494696,0.04879,0.029559,0
4,14_FH1_1_2,Bedrock,Northern,,FH,FH1,Burnham,FH1,FH1_1_2,2.929058,0.277632,48.26,3.541539,6.306677,1.005622e+17,3.816393,464.78,5.632107,7.347706,0.536493,2.499795,0.239017,1.269761,1.004302,3.272606,0.04879,0.587787,0.438255,2.379546,0.34359,0.879627,0.113329,0.24686,2.388763,0.641854,0.641854,0.076961,0.039221,0.09531,0.0,0.086178,0.00995,1.319086,0.678034,0.737164,0.239017,0.693147,0.157004,0.039221,0.173953,0.019803,0.139762,0.029559,0.04879,0.00995,0.04879,0.00995,0.019803,0.00995,0.463734,0.058269,0.086178,0


### Recursive feature elimination with cross-validation is done using a random forest as the estimator 

In [65]:
# Random forest used as the estimator inside RFECV (500 trees, all CPU cores).
# A fixed random_state makes the resulting feature rankings reproducible across
# kernel restarts; without it every fit draws a fresh random seed.
my_rfc = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)



In [66]:
# Recursive feature elimination: remove one feature per iteration and score each
# feature count with 3-fold stratified cross-validation on macro-averaged F1.
my_rfc_selector = RFECV(
    estimator=my_rfc,
    step=1,
    cv=StratifiedKFold(n_splits=3),
    scoring='f1_macro',
    verbose=2,
)

In [67]:
# Column labels from position 9 up to (not including) the last column.
data.columns[9:-1].values

array(['Li7', 'Be9', 'B11', 'Mg24', 'Al27', 'Si28', 'P31', 'S33', 'K39',
       'Ca42', 'Sc45', 'Ti47', 'V51', 'Cr52', 'Mn55', 'Fe56', 'Co59',
       'Ni60', 'Cu63', 'Zn68', 'Ga69', 'Ge72', 'As75', 'Rb85', 'Sr88',
       'Y89', 'Zr90', 'Nb93', 'Mo95', 'Cd111', 'In115', 'Sn118', 'Cs133',
       'Ba137', 'La139', 'Ce140', 'Pr141', 'Nd146', 'Sm147', 'Eu153',
       'Gd157', 'Tb159', 'Dy163', 'Ho165', 'Er166', 'Tm169', 'Yb172',
       'Lu175', 'Hf178', 'Ta181', 'Pb208', 'Th232', 'U238'], dtype=object)

In [68]:
# Append the t-SNE components and the first three principal components as extra columns.
# Copies left behind by a previous run of this cell are dropped first, so re-running
# the cell does not add the same columns twice.
# NOTE(review): concat(axis=1) aligns rows on the index — assumes all three frames
# share the same row index; confirm.
dim_red_frames = [tsne_dims_train, pca_dims_train[['PC1', 'PC2', 'PC3']]]
dim_red_cols = [col for frame in dim_red_frames for col in frame.columns]
data = pd.concat([data.drop(columns=dim_red_cols, errors='ignore')] + dim_red_frames, axis=1)

In [69]:
# Feature matrix: every column after the 9 leading metadata columns, except the target.
# The earlier positional slice [9:-1] only works while 'class' is the last column; once the
# t-SNE/PCA columns are appended it keeps 'class' as a feature (target leakage) and drops 'PC3'.
# NOTE(review): if tsne_df_train.csv was saved with its index (e.g. an 'Unnamed: 0' column),
# that column is picked up as a feature here — confirm the CSV's columns.
feature_cols = [col for col in data.columns.values[9:] if col != 'class']
X = data[feature_cols]
# Target labels.
y = data['class']

In [70]:
# Sanity check: feature-matrix shape on the first line, target shape on the second.
print(X.shape, y.shape, sep='\n')

(1243, 59)
(1243,)


In [71]:
# Run RFECV on the feature matrix and target; fit() returns the fitted selector itself.
selector = my_rfc_selector.fit(X=X, y=y)

Fitting estimator with 59 features.


  'precision', 'predicted', average, warn_for)


Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.


  'precision', 'predicted', average, warn_for)


Fitting estimator with 55 features.


  'precision', 'predicted', average, warn_for)


Fitting estimator with 54 features.
Fitting estimator with 53 features.


  'precision', 'predicted', average, warn_for)


Fitting estimator with 52 features.


  'precision', 'predicted', average, warn_for)


Fitting estimator with 51 features.


  'precision', 'predicted', average, warn_for)


Fitting estimator with 50 features.


  'precision', 'predicted', average, warn_for)


Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.


KeyboardInterrupt: 

### Feature rankings and the number of features with the best macro-F1 score


In [None]:
# Table of the RFE rank of every candidate feature (rank 1 = selected by RFECV).
# The fitted selector is `my_rfc_selector`; the name `my_RFECV_selector` was never defined.
# Feature names are read from X so they always match the columns the selector was fitted on.
feat_ranks = pd.DataFrame(data={'features': X.columns.values, 'rfe ranking': my_rfc_selector.ranking_})


In [None]:
# Display the feature / RFE-ranking table.
feat_ranks

In [None]:
# Number of features RFECV selected via cross-validation.
# Uses `my_rfc_selector`, the selector that was actually defined and fitted.
my_rfc_selector.n_features_

### macro-f1 score is plotted against number of features

In [None]:
# Mean cross-validated score for each candidate number of features.
# Newer scikit-learn exposes it as cv_results_['mean_test_score'] (grid_scores_ was
# removed in 1.2); older releases only have grid_scores_.
if hasattr(my_rfc_selector, 'cv_results_'):
    rfecv_scores = my_rfc_selector.cv_results_['mean_test_score']
else:
    rfecv_scores = my_rfc_selector.grid_scores_

# With step=1 and the default minimum of one feature, the i-th score belongs to i + 1 features.
RFCV_df = pd.DataFrame(data={'no_features': range(1, len(rfecv_scores) + 1), 'scores': rfecv_scores})

In [None]:
# Use a wide figure (11.7 x 8.27 inches) and draw the CV score against the number of features.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.lineplot(x='no_features', y='scores', data=RFCV_df)

### The best features are defined

In [None]:
# Boolean mask over the fitted feature columns: True for every feature RFECV kept.
# Uses `my_rfc_selector`, the selector that was actually defined and fitted.
best_feats = my_rfc_selector.get_support()

In [None]:
# Names of the features RFECV kept.
# The mask comes straight from the fitted selector and the names from X, so this cell
# no longer consumes its own output and can be re-run safely.
best_feats = X.columns.values[my_rfc_selector.get_support()]
best_feats

In [None]:
# Persist the selected feature names with IPython's %store magic so another
# notebook can restore them with `%store -r best_feats`.
%store best_feats