### Feature Selection - Dropping constant features
In this step we remove constant features: columns that hold the same value in every row. Such columns carry no information, so they cannot help solve the problem at hand.

In [1]:
import pandas as pd

# Toy frame for the demonstration: A and B change from row to row,
# while C and D hold a single value in all six rows.
data = pd.DataFrame(
    {
        "A": [1, 2, 4, 1, 2, 4],
        "B": [4, 5, 6, 7, 8, 9],
        "C": [0] * 6,
        "D": [1] * 6,
    }
)

In [2]:
# Preview the first rows of the toy frame (head() shows five rows by default).
data.head()

Unnamed: 0,A,B,C,D
0,1,4,0,1
1,2,5,0,1
2,4,6,0,1
3,1,7,0,1
4,2,8,0,1


##### Variance Threshold
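Feature selector that removes all low-variance features. With `threshold=0` (as used below), only features whose variance is exactly zero, i.e. constant columns, are removed.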

This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning.

In [3]:
from sklearn.feature_selection import VarianceThreshold

# threshold=0 keeps every feature whose variance is greater than zero,
# so only constant columns are rejected.
var_tresh = VarianceThreshold(threshold=0)

# fit() computes the variance of each column of `data`. As the last
# expression of the cell, the fitted selector's repr is displayed.
var_tresh.fit(data)

VarianceThreshold(threshold=0)

In [4]:
# Boolean mask with one entry per fitted column: True = kept, False = rejected.
var_tresh.get_support()

array([ True,  True, False, False])

In [5]:
# Index the column labels with the mask to list the features that were kept.
data.columns[var_tresh.get_support()]

Index(['A', 'B'], dtype='object')

In [11]:
# get_support() returns a boolean mask aligned with data.columns (True = kept).
# Inverting it selects the rejected, zero-variance columns in a single step,
# instead of rebuilding the kept-column Index once per column inside a
# comprehension. tolist() keeps the result a plain list of column labels.
constant_columns = data.columns[~var_tresh.get_support()].tolist()

In [12]:
# Show the columns that were identified as constant.
constant_columns

['C', 'D']

In [14]:
# Remove the constant columns. `columns=` already names the axis, so the
# redundant `axis=1` is gone, and the result is rebound to `data` rather than
# mutating the frame in place (pandas gains nothing from inplace=True).
data = data.drop(columns=constant_columns)

In [15]:
# Display the toy frame after the constant columns were dropped.
data

Unnamed: 0,A,B
0,1,4
1,2,5
2,4,6
3,1,7
4,2,8
5,4,9


# Applying the same steps to a larger dataset (Santander Customer Satisfaction)

In [21]:
# Location of the Santander Customer Satisfaction training file.
# NOTE(review): this is an absolute path on the original author's machine;
# point it at your own copy of train.csv before running.
train_csv_path = r"C:\Users\Asus\Feature Selection\santander-customer-satisfaction\train.csv"

# Only the first 1000 rows are read (presumably to keep the demo quick).
sample_rows = 1000

df = pd.read_csv(train_csv_path, nrows=sample_rows)

In [22]:
# Display the loaded sample; pandas abbreviates the rendering of large frames.
df

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,39205.170000,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,49278.030000,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,67333.770000,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,64007.970000,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,117310.979016,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1974,2,45,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,62226.930000,1
996,1975,2,48,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,97979.910000,0
997,1976,2,32,150.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,58450.710000,0
998,1980,2,25,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,181089.450000,0


In [23]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; TARGET is the label, every other
# column is a feature. random_state pins the shuffle so the split, and the
# constant-column counts derived from X_train further down, are the same on
# every Restart & Run All. Without it each run produces different partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('TARGET', axis=1),
    df['TARGET'],
    test_size=0.3,
    random_state=42,
)

In [28]:
# Inspect the training features (TARGET was removed when the split was made).
X_train

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
398,797,2,23,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0,0,0,0,0.0,0.0,0.0,0.0,131846.310000
253,513,2,100,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0,0,0,0,0.0,0.0,0.0,0.0,96617.220000
60,135,2,38,0.0,477.06,618.36,0.0,0.0,0,0,...,0.0,0,0,0,0,0.0,0.0,0.0,0.0,149904.780000
615,1218,2,32,0.0,0.00,36.00,0.0,0.0,0,0,...,0.0,0,0,0,0,0.0,0.0,0.0,0.0,164043.090000
505,1018,2,75,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0,0,0,0,0.0,0.0,0.0,0.0,208128.960000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,496,2,23,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0,0,0,0,0.0,0.0,0.0,0.0,71070.300000
319,647,2,30,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0,0,0,0,0.0,0.0,0.0,0.0,58371.540000
120,217,2,35,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0,0,0,0,0.0,0.0,0.0,0.0,117310.979016
220,441,2,23,0.0,0.00,33.45,0.0,0.0,0,0,...,0.0,0,0,0,0,0.0,0.0,0.0,0.0,170310.930000


In [24]:
from sklearn.feature_selection import VarianceThreshold

var_tresh = VarianceThreshold(threshold=0)

var_tresh.fit(X_train)

VarianceThreshold(threshold=0)

In [25]:
# Boolean mask with one entry per X_train column: True = kept, False = constant.
var_tresh.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False, False,
       False, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,

In [29]:
# Number of columns the selector keeps, i.e. the non-constant features.
len(X_train.columns[var_tresh.get_support()])

258

In [30]:
# Total number of feature columns in X_train, for comparison with the kept count.
len(X_train.columns)

370

In [31]:
# get_support() returns a boolean mask aligned with X_train.columns (True = kept).
# Inverting it selects the rejected, zero-variance columns in a single step,
# instead of rebuilding the kept-column Index once for each of the several
# hundred columns inside a comprehension. tolist() keeps the result a plain
# list of column labels.
constant_columns = X_train.columns[~var_tresh.get_support()].tolist()

In [33]:
# How many columns were flagged as constant.
len(constant_columns)

112

In [35]:
# Remove the constant columns from the training features. `columns=` already
# names the axis, so the redundant `axis=1` is gone, and the result is rebound
# to `X_train` rather than mutating the frame in place.
X_train = X_train.drop(columns=constant_columns)

In [36]:
# Display the training features after the constant columns were dropped.
X_train

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var17_hace2,saldo_medio_var17_ult1,saldo_medio_var17_ult3,saldo_medio_var29_ult1,saldo_medio_var29_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
398,797,2,23,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,131846.310000
253,513,2,100,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96617.220000
60,135,2,38,0.0,477.06,618.36,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,149904.780000
615,1218,2,32,0.0,0.00,36.00,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,164043.090000
505,1018,2,75,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,208128.960000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,496,2,23,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71070.300000
319,647,2,30,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58371.540000
120,217,2,35,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016
220,441,2,23,0.0,0.00,33.45,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,170310.930000


In [37]:
# Column count after the drop; it should match the number of kept columns reported earlier.
len(X_train.columns)

258