## From https://www.kaggle.com/raviprakash438/wrapper-method-feature-selection

In [220]:
# reset variables
from IPython import get_ipython
get_ipython().magic('reset -sf') 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing # To get MinMax Scaler function

# enable multiple outputs per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# hide warnings
import warnings
warnings.filterwarnings('ignore')

In [221]:
# Read data file
df_train = pd.read_csv('./datasets/train_extracted.csv'
                 , sep=',', encoding='utf-8')

df_test = pd.read_csv('./datasets/test_extracted.csv'
                 , sep=',', encoding='utf-8')

df_valid = pd.read_csv('./datasets/valid_extracted.csv'
                 , sep=',', encoding='utf-8')

#just in case of NaN values
df_train = df_train.dropna();
df_test = df_test.dropna();
df_valid = df_valid.dropna();

In [222]:
# change label into binary classification
# pick out target data
df_train.label.loc[(df_train['label'] >= 0.5)] = 1;
df_train.label.loc[(df_train['label'] < 0.5)] = 0;
df_train_target = df_train['label']

df_test.label.loc[(df_test['label'] >= 0.5)] = 1;
df_test.label.loc[(df_test['label'] < 0.5)] = 0;
df_test_target = df_test['label']

df_valid.label.loc[(df_valid['label'] >= 0.5)] = 1;
df_valid.label.loc[(df_valid['label'] < 0.5)] = 0;
df_valid_target = df_valid['label']

In [223]:
#pick out feature data
df_train_feature = df_train.drop(['label', 'statement'], axis=1)
df_test_feature = df_test.drop(['label', 'statement'], axis=1)
df_valid_feature = df_valid.drop(['label', 'statement'], axis=1)

In [224]:
# remove low variance data
def variance_threshold(df, threshold=0.0):
    selector = VarianceThreshold(threshold)
    VT = selector.fit_transform(df)
    return df[df.columns[selector.get_support(indices=True)]]

# any column with a probability of having 0 variance above 0.8 will be eliminated
# any column with variance lower than 0.16 will be eliminated
df_train_feature = variance_threshold(df_train_feature, (.8 * (1 - .8)));

In [225]:
# remove the same low threshold features from test and valid set
feature_removed = list(set(df_test_feature.columns).difference(df_train_feature.columns))
df_test_feature = df_test_feature.drop(feature_removed, axis=1)
df_valid_feature = df_valid_feature.drop(feature_removed, axis=1)

In [226]:
# Initializing the MinMaxScaler function
min_max_scaler = preprocessing.MinMaxScaler()
#standard_scaler = preprocessing.StandardScaler()

# Scaling dataset keeping the columns name
df_train_feature_scaled = pd.DataFrame(min_max_scaler.fit_transform(df_train_feature), columns = df_train_feature.columns)
df_valid_feature_scaled = pd.DataFrame(min_max_scaler.fit_transform(df_valid_feature), columns = df_valid_feature.columns)
df_test_feature_scaled = pd.DataFrame(min_max_scaler.fit_transform(df_test_feature), columns = df_test_feature.columns)
#X_scaled = pd.DataFrame(standard_scaler.fit_transform(X), columns = X.columns)

In [227]:
# Splitting  up data, seting 80% for train and 20% for test.
x_train, x_test, y_train, y_test = train_test_split(
    df_train_feature_scaled, df_train_target, test_size=0.2, random_state=42)

In [228]:
x_train.shape,x_test.shape

((8165, 52), (2042, 52))

In [229]:
# find correlation between feature values
def correlation(dataset,threshold):
    col_corr=set() # set will contains unique values.
    corr_matrix=dataset.corr() #finding the correlation between columns.
    for i in range(len(corr_matrix.columns)): #number of columns
        for j in range(i):
            if abs(corr_matrix.iloc[i,j])>threshold: #checking the correlation between columns.
                colName=corr_matrix.columns[i] #getting the column name
                col_corr.add(colName) #adding the correlated column name heigher than threshold value.
    return col_corr #returning set of column names

In [230]:
# correlation variable (0.9) determines how closely correlated features have to be, to be discarded
col=correlation(x_train,0.9) 
print('Correlated columns:',col) 

Correlated columns: {'count_family', 'count_negate', 'count_quant', 'count_word', 'count_female', 'count_risk', 'count_space', 'count_sexual', 'count_cause', 'count_anx', 'count_health', 'count_adj', 'count_tentat', 'count_relativ', 'count_anger', 'count_auxverb', 'count_reward', 'count_male', 'count_swear', 'count_sad', 'count_leisure', 'count_social', 'count_affect', 'count_ingest', 'count_adverb', 'count_nonflu', 'count_assent', 'count_netspeak', 'count_achieve', 'count_insight', 'count_hear', 'count_death'}


In [231]:
#remove correlated columns
x_train.drop(columns=col,axis=1,inplace=True)
x_test.drop(columns=col,axis=1,inplace=True)

In [232]:
x_train.shape,x_test.shape

((8165, 20), (2042, 20))

## Forward feature selection

In [233]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

from sklearn.ensemble import RandomForestRegressor

In [234]:
#k_features=10 (It will get top 10 features best suited for prediction)
#forward=True (Forward feature selection model)
#verbose=2 (It will show details output as shown below.)
#cv=5 (Kfold cross valiation: it will split the training set in 5 set and 4 will be using for training the model and 1 will using as validation)
#n_jobs=-1 (Number of cores it will use for execution.-1 means it will use all the cores of CPU for execution.)
#scoring='r2'(R-squared is a statistical measure of how close the data are to the fitted regression line)
model=sfs(RandomForestRegressor(),k_features=10,forward=True,verbose=2,cv=5,n_jobs=-1,scoring='r2')
model.fit(x_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    2.3s remaining:    3.4s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    2.5s finished

[2020-04-01 06:49:38] Features: 1/10 -- score: 0.002133004579714792[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  19 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  16 out of  19 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  19 out of  19 | elapsed:    0.2s finished

[2020-04-01 06:49:38] Features: 2/10 -- score: 0.001243410703994341[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  18 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.3s finished

[2020-04-01 06

SequentialFeatureSelector(clone_estimator=True, cv=5,
                          estimator=RandomForestRegressor(bootstrap=True,
                                                          criterion='mse',
                                                          max_depth=None,
                                                          max_features='auto',
                                                          max_leaf_nodes=None,
                                                          min_impurity_decrease=0.0,
                                                          min_impurity_split=None,
                                                          min_samples_leaf=1,
                                                          min_samples_split=2,
                                                          min_weight_fraction_leaf=0.0,
                                                          n_estimators='warn',
                                                          n_jobs=None,
  

In [235]:
#Get the selected feature index.
model.k_feature_idx_

(0, 1, 2, 9, 10, 13, 14, 15, 17, 18)

In [236]:
#Get the column name for the selected feature.
forward_feature = model.k_feature_names_
print(forward_feature)

('num_-', 'num_"', 'num_$', 'count_pronoun', 'count_they', 'count_interrog', 'count_discrep', 'count_certain', 'count_focuspast', 'count_money')


## Backward feature selection

In [237]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.ensemble import RandomForestRegressor

In [238]:
#k_features=10 (It will get top 10 features best suited for prediction)
#forward=False (Backward feature selection model)
#verbose=2 (It will show details output as shown below.)
#cv=5 (Kfold cross valiation: it will split the training set in 5 set and 4 will be using for training the model and 1 will using as validation)
#n_jobs=-1 (Number of cores it will use for execution.-1 means it will use all the cores of CPU for execution.)
#scoring='r2'(R-squared is a statistical measure of how close the data are to the fitted regression line)
backwardModel=sfs(RandomForestRegressor(),k_features=10,forward=False,verbose=2,cv=5,n_jobs=-1,scoring='r2')
#We will convert our training data into numpy array. If we will not convert it, model is not able to read some of the column names. 
backwardModel.fit(np.array(x_train),y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    1.3s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    2.5s finished

[2020-04-01 06:49:46] Features: 19/10 -- score: -0.1395618112201002[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  19 | elapsed:    1.2s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done  16 out of  19 | elapsed:    2.3s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  19 out of  19 | elapsed:    2.4s finished

[2020-04-01 06:49:48] Features: 18/10 -- score: -0.13690560056805187[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  18 | elapsed:    1.2s remaining:    3.2s
[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed:    2.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    2.2s finished

[2020-04-01 0

SequentialFeatureSelector(clone_estimator=True, cv=5,
                          estimator=RandomForestRegressor(bootstrap=True,
                                                          criterion='mse',
                                                          max_depth=None,
                                                          max_features='auto',
                                                          max_leaf_nodes=None,
                                                          min_impurity_decrease=0.0,
                                                          min_impurity_split=None,
                                                          min_samples_leaf=1,
                                                          min_samples_split=2,
                                                          min_weight_fraction_leaf=0.0,
                                                          n_estimators='warn',
                                                          n_jobs=None,
  

In [239]:
#Get the selected feature index.
backwardModel.k_feature_idx_

(3, 4, 6, 7, 8, 10, 12, 13, 14, 16)

In [240]:
#Get the column name for the selected feature.
backward_feature = x_train.columns[list(backwardModel.k_feature_idx_)]
print(backward_feature)

Index(['num_,', 'num_.', 'count_char_per_word', 'count_unique',
       'count_uppercase', 'count_they', 'count_compare', 'count_interrog',
       'count_discrep', 'count_affiliation'],
      dtype='object')


## Exhaustive Feature Selection

In [241]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as efs
#min_features=1 (minimum number of feature)
#max_features=5 (maximum number of feature)
#n_jobs=-1 (Number of cores it will use for execution.-1 means it will use all the cores of CPU for execution.)
#scoring='r2'(R-squared is a statistical measure of how close the data are to the fitted regression line)
emodel=efs(RandomForestRegressor(),min_features=5,max_features=10,scoring='r2',n_jobs=-1)
#Lets take only 10 features which we got from backward feature selection.
miniData=x_train[x_train.columns[list(backwardModel.k_feature_idx_)]]

emodel.fit(np.array(miniData),y_train)
#If you see below the model creates 637 feature combinations from 10 features.Thats why its computationally very expensive.

Features: 638/638

ExhaustiveFeatureSelector(clone_estimator=True, cv=5,
                          estimator=RandomForestRegressor(bootstrap=True,
                                                          criterion='mse',
                                                          max_depth=None,
                                                          max_features='auto',
                                                          max_leaf_nodes=None,
                                                          min_impurity_decrease=0.0,
                                                          min_impurity_split=None,
                                                          min_samples_leaf=1,
                                                          min_samples_split=2,
                                                          min_weight_fraction_leaf=0.0,
                                                          n_estimators='warn',
                                                          n_jobs=None,
  

In [242]:
#Get the selected feature index.
emodel.best_idx_

(0, 6, 7, 8, 9)

In [243]:
#Get the column name for the selected feature.
exhaustive_feature = miniData.columns[list(emodel.best_idx_)]
print(exhaustive_feature)

Index(['num_,', 'count_compare', 'count_interrog', 'count_discrep',
       'count_affiliation'],
      dtype='object')


In [244]:
print(forward_feature)

('num_-', 'num_"', 'num_$', 'count_pronoun', 'count_they', 'count_interrog', 'count_discrep', 'count_certain', 'count_focuspast', 'count_money')


In [245]:
print(backward_feature)

Index(['num_,', 'num_.', 'count_char_per_word', 'count_unique',
       'count_uppercase', 'count_they', 'count_compare', 'count_interrog',
       'count_discrep', 'count_affiliation'],
      dtype='object')


In [246]:
list(set(forward_feature).intersection(backward_feature))

['count_interrog', 'count_discrep', 'count_they']