In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

## Feature Selection - Dropping constant features
In this step we will remove constant features, i.e. features that take the same value in every row. Such features carry no information and are therefore not useful for solving the problem statement.

In [2]:
# Build a small toy DataFrame: columns A and B vary, columns C and D are constant
toy_columns = {
    "A": [1, 4, 7, 8],
    "B": [8, 5, 4, 2],
    "C": [0, 0, 0, 0],
    "D": [1, 1, 1, 1],
}
data = pd.DataFrame(toy_columns)
data

Unnamed: 0,A,B,C,D
0,1,8,0,1
1,4,5,0,1
2,7,4,0,1
3,8,2,0,1


## Variance Threshold
Feature selector that removes all low-variance features.

This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning.

In [3]:
# Fit a selector that removes zero-variance (constant) features.
# NOTE: VarianceThreshold is already imported in the first cell; this import is redundant.
from sklearn.feature_selection import VarianceThreshold
ver_thre = VarianceThreshold(threshold=0) # the threshold can be raised to also drop low-variance features
ver_thre.fit(data)

In [4]:
# Boolean mask with one entry per column of `data`:
# True = feature is kept (not constant), False = feature is constant
ver_thre.get_support()

array([ True,  True, False, False])

* In this output, `True` means the feature is not constant (it is kept), and `False` means the feature is constant (it will be removed).

In [5]:
# Get all constant features.
# get_support() is True for the columns that are kept (non-constant), so the
# inverted mask selects exactly the constant ones. Selecting by position in a
# single step avoids rebuilding the kept-column index for every column and
# stays correct even if two columns share the same name.
constant_columns = data.columns[~ver_thre.get_support()].tolist()

print(constant_columns,len(constant_columns))

['C', 'D'] 2


In [6]:
# Print the name of each constant column on its own line
if constant_columns:
    print("\n".join(str(name) for name in constant_columns))

C
D


In [7]:
# Show a copy of `data` without the constant columns (`data` itself is not modified)
data.drop(columns=constant_columns)

Unnamed: 0,A,B
0,1,8
1,4,5
2,7,4
3,8,2


## Practice on bigger dataset

https://www.kaggle.com/c/santander-customer-satisfaction/data?select=train.csv

In [8]:
# Load a sample of the Santander training data from train.csv in the working directory
df = pd.read_csv("train.csv",nrows=1000) # nrows=1000 reads only the first 1000 rows
df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,117310.979016,0


In [9]:
# (number of rows, number of columns) of the loaded sample
df.shape

(1000, 371)

In [10]:
# Separate the independent features (X) from the dependent target (y)
y = df["TARGET"]
X = df.drop(columns=["TARGET"])

In [11]:
# Preview the first rows of the feature matrix (TARGET column removed)
X.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0,0,0,0,0.0,0.0,0.0,0.0,39205.17
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0,0,0,0,0.0,0.0,0.0,0.0,49278.03
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0,0,0,0,0.0,0.0,0.0,0.0,67333.77
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0,0,0,0,0.0,0.0,0.0,0.0,64007.97
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0,0,0,0,0.0,0.0,0.0,0.0,117310.979016


In [12]:
# Preview the first values of the target column
y.head()

0    0
1    0
2    0
3    0
4    0
Name: TARGET, dtype: int64

In [13]:
# Split data into training (80%) and testing (20%) sets.
# random_state is fixed so the same split is produced on every run.
# NOTE: train_test_split is already imported in the first cell; this import is redundant.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,random_state=45)

In [14]:
# (rows, columns) of the training and testing feature matrices
X_train.shape, X_test.shape

((800, 370), (200, 370))

In [15]:
# Number of target values in the training and testing splits
y_train.shape, y_test.shape

((800,), (200,))

## Let's apply variance threshold

In [16]:
# Object creation: threshold=0 targets zero-variance (constant) features
var_thres = VarianceThreshold(threshold=0)

# Fit on the training data only. The testing data is deliberately not used
# here, so that it does not influence which features are selected.
var_thres.fit(X_train)

In [None]:
# Boolean mask over the columns of X_train: True = kept (non-constant), False = constant
var_thres.get_support()

In [None]:
# Number of non-constant features, i.e. the count of True entries in the support mask
var_thres.get_support().sum()

In [None]:
# Number of constant features.
# get_support() marks the features that are KEPT (non-constant), so the mask
# must be inverted to count the constant ones.
len(X_train.columns[~var_thres.get_support()])

In [None]:
# Get the names of all constant features.
# get_support() is True for kept (non-constant) columns, so the inverted mask
# selects exactly the constant ones. Selecting by position in a single step
# avoids rebuilding the kept-column index once per column (the data has
# hundreds of columns) and stays correct if column names are duplicated.
constant_feature = X_train.columns[~var_thres.get_support()].tolist()

print(len(constant_feature))

In [None]:
# Print the name of each constant feature on its own line
if constant_feature:
    print(*constant_feature, sep="\n")

In [None]:
# Drop the constant features.
# Reassign instead of using inplace=True, and pass errors="ignore", so this
# cell can be re-run without raising a KeyError for columns already removed.
X_train = X_train.drop(columns=constant_feature, errors="ignore")
# Apply the same selection (learned from the training data) to the test set,
# so both splits keep an identical set of feature columns.
X_test = X_test.drop(columns=constant_feature, errors="ignore")

In [None]:
# (rows, columns) of the training features after the constant columns were dropped
X_train.shape