# **Feature Selection- Dropping constant features**

In this step we remove features whose value is constant across all records. Such features carry no information and therefore do not help solve the problem statement.

In [1]:
# pandas supplies the DataFrame used throughout this notebook.
import pandas as pd

# Toy dataset: columns "A" and "B" vary, while "C" and "D" hold a single
# repeated value and are therefore constant features.
data = pd.DataFrame(
    {
        "A": [1, 2, 4, 1, 2, 4],
        "B": list(range(4, 10)),
        "C": [0] * 6,
        "D": [1] * 6,
    }
)

In [2]:
# Columns 'C' and 'D' hold a single value each, so they are candidates for
# removal; we double-check this with scikit-learn below.
data.head(n=5)

Unnamed: 0,A,B,C,D
0,1,4,0,1
1,2,5,0,1
2,4,6,0,1
3,1,7,0,1
4,2,8,0,1


**Variance Threshold** - **[Read More](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html)**

Feature selector that removes all low-variance features.

This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning.

Constant features are of two types:
1. Constant feature: the same value in all the records.
2. Quasi-constant feature: one value is dominant in the column (e.g. present in 99.9% of the records).

In [3]:
# Fit a selector that removes zero-variance (constant) features.
from sklearn.feature_selection import VarianceThreshold

# threshold=0.0 removes only zero-variance features; raise it to also filter
# out quasi-constant features.
var_thres = VarianceThreshold(threshold=0.0).fit(data)
var_thres

VarianceThreshold()

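The fitted selector's **`get_support()`** method returns a boolean mask of the features that were kept. Later in this notebook we also use the **`get_constant_features`** function from fast-ml to list all the constant features: **[Read More](https://pypi.org/project/fast-ml/)**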

In [4]:
var_thres.get_support()
# True marks a column with non-zero variance (kept); False marks a column with
# zero variance (constant, to be removed).

array([ True,  True, False, False])

In [5]:
# Use the boolean support mask to list the columns that survive the selector.
support_mask = var_thres.get_support()
data.columns[support_mask]

Index(['A', 'B'], dtype='object')

In [6]:
# The columns rejected by the selector are exactly those where the support mask
# is False. Inverting the mask once avoids re-slicing `data.columns` (and doing
# a membership test) for every column, as the previous comprehension did.
constant_columns = data.columns[~var_thres.get_support()].tolist()

print(len(constant_columns))

2


In [7]:
# Show each constant column name on its own line.
for feature in constant_columns:
    print(feature)

C
D


In [8]:
# Display a copy of the toy frame without the constant columns; `data` itself
# is left untouched because the result is not assigned.
data.drop(columns=constant_columns)

Unnamed: 0,A,B
0,1,4
1,2,5
2,4,6
3,1,7
4,2,8
5,4,9


Let's practise on a bigger dataset: **[Open Link](https://www.kaggle.com/c/santander-customer-satisfaction/data?select=train.csv)**

In [9]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

In [10]:
# Load only the first 10,000 rows to keep the example fast.
# NOTE(review): presumably 'santander.csv' is the Kaggle train.csv saved
# locally under that name — confirm.
df = pd.read_csv(
    "santander.csv",
    nrows=10_000,
)

In [11]:
# Preview the loaded frame (the rendered output is truncated to its first and last rows).
df

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.170000,0
1,3,2,34,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.030000,0
2,4,2,23,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.770000,0
3,8,2,37,0.0,195.00,195.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.970000,0
4,10,2,39,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,20069,2,52,150.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27745.740000,1
9996,20070,2,23,90.0,51.06,51.06,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120468.090000,0
9997,20071,2,37,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,98462.880000,0
9998,20072,2,24,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,102070.770000,0


In [12]:
# Separate the predictors from the label column.
X = df.drop(columns=["TARGET"])
y = df["TARGET"]

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Reuse the X / y built in the previous
# cell instead of recomputing `df.drop(...)` and `df['TARGET']` a second time;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((7000, 370), (3000, 370))

### Let's apply the variance threshold

In [14]:
# Fit the zero-variance selector on the training split only, so the test split
# plays no part in deciding which features are dropped.
var_thres = VarianceThreshold(threshold=0).fit(X_train)
var_thres

VarianceThreshold(threshold=0)

In [15]:
# Boolean mask over the X_train columns: True = kept, False = constant feature.
var_thres.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [16]:
# Count the non-constant features: each True in the support mask counts as 1.
var_thres.get_support().sum()

284

In [17]:
# Same count, obtained by selecting the surviving column labels first.
kept_columns = X_train.columns[var_thres.get_support()]
len(kept_columns)

284

In [18]:
# The columns rejected by the selector are exactly those where the support mask
# is False. Inverting the mask once avoids re-slicing `X_train.columns` (and
# doing a membership test) for each of the columns, as the previous
# comprehension did.
constant_columns = X_train.columns[~var_thres.get_support()].tolist()

print(len(constant_columns))

86


In [19]:
# Print the name of every constant column, one per line.
for column in constant_columns:
    print(column)

ind_var2_0
ind_var2
ind_var13_medio_0
ind_var13_medio
ind_var18_0
ind_var18
ind_var27_0
ind_var28_0
ind_var28
ind_var27
ind_var34_0
ind_var34
ind_var41
ind_var46_0
ind_var46
num_var13_medio_0
num_var13_medio
num_var18_0
num_var18
num_var27_0
num_var28_0
num_var28
num_var27
num_var34_0
num_var34
num_var41
num_var46_0
num_var46
saldo_var13_medio
saldo_var18
saldo_var28
saldo_var27
saldo_var34
saldo_var41
saldo_var46
delta_imp_amort_var18_1y3
delta_imp_amort_var34_1y3
delta_imp_reemb_var17_1y3
delta_imp_reemb_var33_1y3
delta_imp_trasp_var17_out_1y3
delta_imp_trasp_var33_out_1y3
delta_num_reemb_var17_1y3
delta_num_reemb_var33_1y3
delta_num_trasp_var17_out_1y3
delta_num_trasp_var33_out_1y3
imp_amort_var18_hace3
imp_amort_var18_ult1
imp_amort_var34_hace3
imp_amort_var34_ult1
imp_var7_emit_ult1
imp_reemb_var13_hace3
imp_reemb_var17_hace3
imp_reemb_var17_ult1
imp_reemb_var33_hace3
imp_reemb_var33_ult1
imp_trasp_var17_in_hace3
imp_trasp_var17_out_hace3
imp_trasp_var17_out_ult1
imp_trasp_var33_i

In [20]:
from fast_ml.utilities import display_all
from fast_ml.feature_selection import get_constant_features

# Ask fast-ml for a summary DataFrame of the constant-like features in X_train;
# the output below has one row per feature with Desc, Var, Value and Perc columns.
constant_features = get_constant_features(X_train)
display_all(constant_features)

# NOTE(review): presumably the first 86 rows are the strictly constant features
# (matching the VarianceThreshold count) and the remaining rows are
# quasi-constant — confirm against the full output.

Unnamed: 0,Desc,Var,Value,Perc
0,Constant,delta_imp_reemb_var17_1y3,0.0,100.0
1,Constant,num_venta_var44_hace3,0.0,100.0
2,Constant,num_reemb_var33_ult1,0.0,100.0
3,Constant,saldo_var46,0.0,100.0
4,Constant,num_var27,0.0,100.0
5,Constant,num_var28,0.0,100.0
6,Constant,num_var28_0,0.0,100.0
7,Constant,num_var27_0,0.0,100.0
8,Constant,saldo_var2_ult1,0.0,100.0
9,Constant,saldo_medio_var13_medio_hace2,0.0,100.0


In [21]:
# Drop all the constant and quasi-constant features reported by fast-ml.
# Rebinding `df` (rather than using inplace=True) together with
# errors="ignore" keeps this cell idempotent: the in-place version raised
# KeyError when re-run, because the columns had already been removed.
# NOTE(review): X_train / X_test were split before this point and still contain
# these columns; drop them there as well before training a model.
df = df.drop(columns=constant_features['Var'], errors="ignore")

In [22]:
# Display the frame again to inspect which columns remain.
df

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var41_comer_ult1,imp_op_var41_comer_ult3,imp_op_var41_efect_ult1,imp_op_var41_efect_ult3,...,saldo_medio_var12_hace2,saldo_medio_var12_hace3,saldo_medio_var12_ult1,saldo_medio_var12_ult3,saldo_medio_var13_corto_hace2,saldo_medio_var13_corto_hace3,saldo_medio_var13_corto_ult1,saldo_medio_var13_corto_ult3,var38,TARGET
0,1,2,23,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.00,39205.170000,0
1,3,2,34,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,0.0,0.0,0.00,0.00,300.0,122.22,300.0,240.75,49278.030000,0
2,4,2,23,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.00,67333.770000,0
3,8,2,37,0.0,195.00,195.00,195.00,195.00,0.0,0.0,...,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.00,64007.970000,0
4,10,2,39,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,0.0,0.0,85501.89,85501.89,0.0,0.00,0.0,0.00,117310.979016,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,20069,2,52,150.0,0.00,0.00,0.00,0.00,0.0,0.0,...,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.00,27745.740000,1
9996,20070,2,23,90.0,51.06,51.06,51.06,51.06,0.0,0.0,...,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.00,120468.090000,0
9997,20071,2,37,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.00,98462.880000,0
9998,20072,2,24,0.0,0.00,0.00,0.00,0.00,0.0,0.0,...,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.00,102070.770000,0


In [23]:
# Done ...