### Feature Selection - Dropping constant features
In this step we remove constant features, i.e. columns that hold the same value in every row.
Such features carry no information and are therefore not useful for solving the problem statement.

In [1]:
# Import pandas to create DataFrame 
import pandas as pd 
  
# Toy frame: A and B vary from row to row, while C and D repeat a single value.
data = pd.DataFrame(
    dict(
        A=[1, 2, 4, 1, 2, 4],
        B=[4, 5, 6, 7, 8, 9],
        C=[0] * 6,
        D=[1] * 6,
    )
)

In [None]:
# Preview the first five rows of the toy frame.
data.head()

Unnamed: 0,A,B,C,D
0,1,4,0,1
1,2,5,0,1
2,4,6,0,1
3,1,7,0,1
4,2,8,0,1


##### Variance Threshold
Feature selector that removes all low-variance features.

This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning.

In [None]:
# Fit a selector that removes zero-variance (constant) features.
from sklearn.feature_selection import VarianceThreshold
# threshold=0: only features whose variance is not above 0 are flagged for removal.
var_thres=VarianceThreshold(threshold=0)
# Only the feature matrix is passed; no target is needed for this selector.
var_thres.fit(data)

VarianceThreshold(threshold=0)

In [None]:
# Boolean mask over the columns: True = feature kept, False = feature removed.
var_thres.get_support()

array([ True,  True, False, False])

In [None]:
# Index the column labels with the mask to list the retained (non-constant) columns.
data.columns[var_thres.get_support()]

Index(['A', 'B'], dtype='object')

In [None]:
# Invert the support mask once to select the removed (constant) columns.
# This avoids calling get_support() and rebuilding the retained-column index for
# every column name, and it selects by position rather than by label membership.
constant_columns = data.columns[~var_thres.get_support()].tolist()

print(len(constant_columns))

2


In [None]:
# List the name of each constant column, one per line.
for feature in constant_columns:
    print(feature)

C
D


In [None]:
# Display the frame without the constant columns; drop() returns a new frame,
# so `data` itself is left unchanged.
data.drop(columns=constant_columns)

Unnamed: 0,A,B
0,1,4
1,2,5
2,4,6
3,1,7
4,2,8
5,4,9


#### Let's practise on a bigger dataset
https://www.kaggle.com/c/santander-customer-satisfaction/data?select=train.csv

In [None]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Load only the first 10,000 rows of the CSV.
# NOTE(review): 'santander.csv' must be present in the working directory, otherwise
# read_csv raises FileNotFoundError; presumably it is the Kaggle train.csv saved
# under this name -- confirm.
df=pd.read_csv('santander.csv',nrows=10000)

FileNotFoundError: ignored

In [None]:
# (rows, columns) of the loaded sample.
df.shape

In [None]:
# Preview the first rows of the loaded sample.
df.head()

In [None]:
# Split the frame into the feature matrix (everything except TARGET) and the label column.
X = df.drop(columns=['TARGET'])
y = df['TARGET']

In [None]:
from sklearn.model_selection import train_test_split

# Separate the dataset into train and test: 30% of the rows are held out, and the
# fixed random_state makes the split repeatable. X and y (features / TARGET) are
# reused here instead of recomputing the same drop/selection on df.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

#### Let's apply the variance threshold

In [None]:
# Fit the selector on the training split only (the test split is not passed to fit).
# Note: this rebinds var_thres, replacing the selector fitted on the toy frame.
var_thres=VarianceThreshold(threshold=0)
var_thres.fit(X_train)

In [None]:
# Boolean mask over the X_train columns: True = feature kept, False = feature removed.
var_thres.get_support()

In [None]:
# Count the retained (non-constant) features: every True in the mask counts as 1.
var_thres.get_support().sum()

In [None]:
# Number of non-constant features, obtained from the retained column labels.
X_train.columns[var_thres.get_support()].size

In [None]:
# Invert the support mask once to select the removed (constant) columns.
# This avoids calling get_support() and rebuilding the retained-column index for
# every column name, which adds up on a wide frame, and it selects by position
# rather than by label membership.
constant_columns = X_train.columns[~var_thres.get_support()].tolist()

print(len(constant_columns))

In [None]:
# List the name of each constant column found in X_train, one per line.
for column in constant_columns:
    print(column)

In [None]:
# Keep the reduced frames: drop() returns a new DataFrame, so without an assignment
# the constant columns would still be present in X_train afterwards.
X_train_reduced = X_train.drop(columns=constant_columns)
# Drop the same columns from the test split so both splits keep identical features.
X_test_reduced = X_test.drop(columns=constant_columns)

X_train_reduced