## 1. Feature Selection Techniques - Dropping Constant Features

In this step we remove constant features (columns whose value never changes), because they carry no information that helps solve the problem statement.

In [1]:
# Import pandas to create Dataframe
import pandas as pd

# Toy frame for the demo: A and B vary from row to row,
# while C (all zeros) and D (all ones) never change.
data = pd.DataFrame(
    dict(
        A=[1, 2, 4, 1, 2, 4],
        B=[4, 5, 6, 7, 8, 9],
        C=[0] * 6,
        D=[1] * 6,
    )
)

In [2]:
# Show the toy frame; columns C and D each hold a single repeated value
data

Unnamed: 0,A,B,C,D
0,1,4,0,1
1,2,5,0,1
2,4,6,0,1
3,1,7,0,1
4,2,8,0,1
5,4,9,0,1


Here in our data, columns C and D have zero variance, which means their values never change.

### Variance Threshold
Feature selector that removes all low-variance features or constant features <br><br>
This feature selection algorithm looks only at the features (X), not the desired output(y), and can thus be used for unsupervised learning

In [3]:
# With threshold 0 the selector keeps every feature whose variance is non-zero,
# i.e. it filters out only the constant columns.
from sklearn.feature_selection import VarianceThreshold
var_thres = VarianceThreshold(threshold = 0)
var_thres.fit(data)

In [4]:
# Boolean mask aligned with data's columns: True marks a column the selector keeps
var_thres.get_support()

array([ True,  True, False, False])

This indicates that <br>
Column A : True (non-zero variance)<br>
Column B : True (non-zero variance)<br>
Column C : False (zero variance)<br>
Column D : False (zero variance)<br>
<br>
So columns C and D can be removed because both are zero-variance columns

In [5]:
# Use the mask to look up the names of the retained columns
data.columns[var_thres.get_support()]

Index(['A', 'B'], dtype='object')

In [6]:
# get_support() is True for the columns the selector keeps, so the inverted
# mask selects the constant (zero-variance) columns in a single lookup instead
# of rebuilding the kept-column index once per column.
constant_columns = data.columns[~var_thres.get_support()].tolist()
print(constant_columns,'-->',len(constant_columns))

['C', 'D'] --> 2


In [7]:
# Preview the frame without the constant columns (data itself is not modified)
data.drop(columns=constant_columns)

Unnamed: 0,A,B
0,1,4
1,2,5
2,4,6
3,1,7
4,2,8
5,4,9


### Variance Threshold on Bigger Dataset [santander-customer-satisfaction]

In [8]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

In [9]:
import os

# Location of the Santander training CSV. Set the SANTANDER_TRAIN_CSV
# environment variable to point at your own copy of the file; when it is not
# set, the original local path is used unchanged.
santander_train_csv = os.environ.get(
    "SANTANDER_TRAIN_CSV",
    r"G:\Udemy\DATA SCIENCE ineuron\Resources\Dataset\santander-customer-satisfaction Train.csv",
)

# nrows=10000 limits the load to the first 10,000 rows of the file
df = pd.read_csv(santander_train_csv, nrows=10000)
df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [10]:
# Print the (rows, columns) shape of the loaded sample, then show the column labels
print(df.shape)
df.columns

(10000, 371)


Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var33_hace2', 'saldo_medio_var33_hace3',
       'saldo_medio_var33_ult1', 'saldo_medio_var33_ult3',
       'saldo_medio_var44_hace2', 'saldo_medio_var44_hace3',
       'saldo_medio_var44_ult1', 'saldo_medio_var44_ult3', 'var38', 'TARGET'],
      dtype='object', length=371)

We have 10000 rows and 371 columns (370 features plus the `TARGET` column)

### Divide the data into dependent and independent features

In [11]:
# Independent features: every column except the label
X = df.drop(columns='TARGET')
# Dependent variable: the label column on its own
y = df.loc[:, 'TARGET']

Variance Threshold is only applied on the independent features

In [12]:
from sklearn.model_selection import train_test_split

# Separate the dataset into train and test: 30% of the rows are held out,
# and the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
x_train.shape, x_test.shape

((7000, 370), (3000, 370))

### Let's apply the variance threshold

In [13]:
# Create a fresh zero-threshold selector and fit it on the training split only
var_thres = VarianceThreshold(threshold=0)
var_thres.fit(x_train)

In [14]:
# Count the non-constant features: each True in the support mask is a kept column
var_thres.get_support().sum()

284

In [16]:
# Cross-check: number of retained column names (should match the mask's True count)
len(x_train.columns[var_thres.get_support()])

284

In [15]:
# Names of the non-constant columns, i.e. the ones the selector retains
x_train.columns[var_thres.get_support()]

Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var29_ult3', 'saldo_medio_var33_hace2',
       'saldo_medio_var33_hace3', 'saldo_medio_var33_ult1',
       'saldo_medio_var33_ult3', 'saldo_medio_var44_hace2',
       'saldo_medio_var44_hace3', 'saldo_medio_var44_ult1',
       'saldo_medio_var44_ult3', 'var38'],
      dtype='object', length=284)

There are 284 non-constant features, which means 370 - 284 = 86 columns have constant values

In [17]:
# Invert the support mask to pick the constant (zero-variance) columns in one
# lookup; the previous comprehension rebuilt the kept-column index for each of
# the training columns.
constant_columns = x_train.columns[~var_thres.get_support()].tolist()
print(len(constant_columns),'--',constant_columns)

86 -- ['ind_var2_0', 'ind_var2', 'ind_var13_medio_0', 'ind_var13_medio', 'ind_var18_0', 'ind_var18', 'ind_var27_0', 'ind_var28_0', 'ind_var28', 'ind_var27', 'ind_var34_0', 'ind_var34', 'ind_var41', 'ind_var46_0', 'ind_var46', 'num_var13_medio_0', 'num_var13_medio', 'num_var18_0', 'num_var18', 'num_var27_0', 'num_var28_0', 'num_var28', 'num_var27', 'num_var34_0', 'num_var34', 'num_var41', 'num_var46_0', 'num_var46', 'saldo_var13_medio', 'saldo_var18', 'saldo_var28', 'saldo_var27', 'saldo_var34', 'saldo_var41', 'saldo_var46', 'delta_imp_amort_var18_1y3', 'delta_imp_amort_var34_1y3', 'delta_imp_reemb_var17_1y3', 'delta_imp_reemb_var33_1y3', 'delta_imp_trasp_var17_out_1y3', 'delta_imp_trasp_var33_out_1y3', 'delta_num_reemb_var17_1y3', 'delta_num_reemb_var33_1y3', 'delta_num_trasp_var17_out_1y3', 'delta_num_trasp_var33_out_1y3', 'imp_amort_var18_hace3', 'imp_amort_var18_ult1', 'imp_amort_var34_hace3', 'imp_amort_var34_ult1', 'imp_var7_emit_ult1', 'imp_reemb_var13_hace3', 'imp_reemb_var17_ha

In [20]:
# Remove the zero-variance columns identified on the training split.
# The same columns are removed from the test split so both splits keep an
# identical feature set. errors="ignore" lets the cell be re-run safely after
# the columns are already gone (a plain drop would raise KeyError).
x_train = x_train.drop(columns=constant_columns, errors="ignore")
x_test = x_test.drop(columns=constant_columns, errors="ignore")

In [21]:
# Inspect the training split after the constant columns were dropped
x_train

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
7681,15431,2,42,840.0,4477.02,4989.54,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37491.21
9031,18181,2,31,0.0,52.32,52.32,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106685.94
3691,7411,2,51,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66144.66
202,407,2,36,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,92121.36
5625,11280,2,23,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74650.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9225,18564,2,33,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117547.89
4859,9723,2,24,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71050.83
3264,6557,2,24,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,141069.33
9845,19796,2,38,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,86412.15


### Summary
We now have a set of independent features with all constant columns removed using `VarianceThreshold`