## Dropping constant features

In this step we will remove constant features, i.e. columns that hold the same value in every row. Such columns carry no information that could help solve the problem statement.

In [1]:
#importing libraries to create the DataFrame
import pandas as pd

# Toy frame for the demo: A and B vary from row to row, C and D hold a single value throughout
data = pd.DataFrame(
    {
        "A": [1, 2, 4, 1, 2, 4],
        "B": [4, 5, 6, 7, 8, 9],
        "C": [0] * 6,
        "D": [1] * 6,
    }
)

In [2]:
# Peek at the first rows of the toy frame
data.head()

Unnamed: 0,A,B,C,D
0,1,4,0,1
1,2,5,0,1
2,4,6,0,1
3,1,7,0,1
4,2,8,0,1


## Variance Threshold

Feature selector that removes all low-variance features.

This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning.

fit(X[, y]) ->  Learn empirical variances from X.

fit_transform(X[, y]) ->  Fit to data, then transform it.

get_params([deep]) ->  Get parameters for this estimator.

get_support([indices]) -> Get a mask, or integer index, of the features selected

inverse_transform(X) -> Reverse the transformation operation

set_params(**params) -> Set the parameters of this estimator.

transform(X) -> Reduce X to the selected features.

For more info: https://bit.ly/3BdmCTH

In [3]:
# Fitting only learns each column's variance; nothing is removed from `data` at this point.
# With threshold=0 the fitted selector flags zero-variance (constant) columns for removal.
from sklearn.feature_selection import VarianceThreshold
var_thres = VarianceThreshold(threshold=0)
var_thres.fit(data)

VarianceThreshold(threshold=0)

In [4]:
# Boolean mask over the columns of `data`: True = feature selected (kept), False = dropped
var_thres.get_support()

array([ True,  True, False, False])

In [5]:
# Index the column labels with the mask to get the names of the columns that are kept
data.columns[var_thres.get_support()]

Index(['A', 'B'], dtype='object')

In [6]:
# get_support() is True for the columns that are kept, so the inverted mask selects
# the constant ones. A single positional mask lookup replaces the per-column
# "name not in kept names" test, which re-ran get_support() for every column and
# compared by name instead of by position.
constant_columns = data.columns[~var_thres.get_support()].tolist()
print(len(constant_columns))

2


In [7]:
# List the constant column names, one per line (prints nothing when the list is empty)
if constant_columns:
    print(*constant_columns, sep="\n")

C
D


In [8]:
# Show the frame without the constant columns; drop() returns a new frame, `data` itself is not modified
data.drop(columns=constant_columns)

Unnamed: 0,A,B
0,1,4
1,2,5
2,4,6
3,1,7
4,2,8
5,4,9


### Let's work on a bigger, real-world dataset

In [9]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

In [10]:
# Read only the first 7777 data rows of the CSV (nrows limits how many rows are parsed)
df = pd.read_csv('Santandra.csv',nrows = 7777)

In [11]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [12]:
# (rows, columns) of the loaded sample
df.shape

(7777, 371)

In [13]:
# Per-column summary statistics
# NOTE(review): var3 shows a minimum of -999999 in the output — looks like a placeholder
# for missing values; confirm before relying on its variance or mean
df.describe()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
count,7777.0,7777.0,7777.0,7777.0,7777.0,7777.0,7777.0,7777.0,7777.0,7777.0,...,7777.0,7777.0,7777.0,7777.0,7777.0,7777.0,7777.0,7777.0,7777.0,7777.0
mean,7793.105439,-640.164588,33.390896,68.628786,72.595275,125.969819,5.615813,11.3479,0.651922,0.802366,...,4.342536,1.633555,4.600315,3.577306,19.021539,3.491052,30.588096,22.967556,117412.1,0.040375
std,4511.658647,25349.419567,13.017427,739.229988,330.593509,584.481232,136.766455,247.461472,30.275218,32.879615,...,239.236567,91.147911,250.365342,189.568691,678.726243,222.570408,925.584425,706.377719,173513.5,0.196851
min,1.0,-999999.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10485.18,0.0
25%,3881.0,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68184.63,0.0
50%,7791.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,107924.0,0.0
75%,11692.0,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120167.8,0.0
max,15655.0,235.0,102.0,39000.0,7904.22,15182.46,7904.22,11073.57,1800.0,1800.0,...,15265.95,6643.29,15458.22,11959.17,44990.49,18928.59,47617.08,36988.2,9055272.0,1.0


In [14]:
# Column count, dtypes and memory footprint of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7777 entries, 0 to 7776
Columns: 371 entries, ID to TARGET
dtypes: float64(86), int64(285)
memory usage: 22.0 MB


In [15]:
# Label vector: the TARGET column. Feature matrix: every other column of df.
# NOTE(review): the ID column stays in X — presumably a row identifier rather than a
# real feature; confirm whether it should be excluded as well.
Y = df["TARGET"]
X = df.drop(columns=["TARGET"])

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

# Shapes of the two feature frames as a quick sanity check
(X_train.shape, X_test.shape)

((5443, 370), (2334, 370))

##### Let's apply the Variance Threshold method

In [17]:
# Rebind var_thres to a fresh selector (it was previously fitted on the toy frame) and
# fit it on the training split only; with threshold=0 it flags zero-variance columns
var_thres = VarianceThreshold(threshold=0)
var_thres.fit(X_train)

VarianceThreshold(threshold=0)

In [18]:
# Boolean mask over the columns the selector was fitted on: True = kept, False = dropped
var_thres.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False, False, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False, False,  True,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [19]:
# Number of columns the selector keeps (count of True entries in the mask)
var_thres.get_support().sum()

266

In [20]:
# Same count, this time taken from the kept column labels
X_train.columns[var_thres.get_support()].size

266

In [21]:
# get_support() is True for the columns that are kept, so the inverted mask selects
# the zero-variance ones. A single positional mask lookup replaces the per-column
# "name not in kept names" test, which re-ran get_support() and rebuilt the kept
# Index once for each of the columns.
constant_columns = X_train.columns[~var_thres.get_support()].tolist()

print(len(constant_columns))

104


In [22]:
# List the zero-variance column names, one per line (prints nothing when the list is empty)
if constant_columns:
    print(*constant_columns, sep="\n")

ind_var2_0
ind_var2
ind_var6_0
ind_var6
ind_var13_medio_0
ind_var13_medio
ind_var18_0
ind_var18
ind_var27_0
ind_var28_0
ind_var28
ind_var27
ind_var29_0
ind_var29
ind_var34_0
ind_var34
ind_var41
ind_var46_0
ind_var46
num_var6_0
num_var6
num_var13_medio_0
num_var13_medio
num_var18_0
num_var18
num_var27_0
num_var28_0
num_var28
num_var27
num_var29_0
num_var29
num_var34_0
num_var34
num_var41
num_var46_0
num_var46
saldo_var6
saldo_var13_medio
saldo_var18
saldo_var28
saldo_var27
saldo_var29
saldo_var34
saldo_var41
saldo_var46
delta_imp_amort_var18_1y3
delta_imp_amort_var34_1y3
delta_imp_reemb_var17_1y3
delta_imp_reemb_var33_1y3
delta_imp_trasp_var17_out_1y3
delta_imp_trasp_var33_in_1y3
delta_imp_trasp_var33_out_1y3
delta_num_reemb_var17_1y3
delta_num_reemb_var33_1y3
delta_num_trasp_var17_out_1y3
delta_num_trasp_var33_in_1y3
delta_num_trasp_var33_out_1y3
imp_amort_var18_hace3
imp_amort_var18_ult1
imp_amort_var34_hace3
imp_amort_var34_ult1
imp_var7_emit_ult1
imp_reemb_var13_hace3
imp_reemb_var1

In [23]:
# Remove the zero-variance columns from the training features.
# errors="ignore" keeps this cell re-runnable: X_train is rebound here, so running the
# cell a second time would otherwise raise KeyError for the already-removed labels.
# NOTE(review): X_test is not reduced in the cells visible here; drop the same columns
# from it before it is passed to anything fitted on the reduced X_train.
X_train = X_train.drop(columns=constant_columns, errors="ignore")

In [24]:
# Display the reduced training frame (pandas abbreviates the rendering of large frames)
X_train

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var17_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
3338,6688,2,23,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17523.78
5586,11188,2,30,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46636.83
1206,2398,2,64,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96978.21
4606,9220,2,64,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,236307.90
923,1843,2,33,0.0,1027.62,1027.62,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,110254.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4931,9862,2,55,0.0,15.06,15.06,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,104675.85
3264,6557,2,24,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,141069.33
1653,3281,2,23,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,79891.74
2607,5220,2,39,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,148284.75
